{ "best_metric": null, "best_model_checkpoint": null, "episode": 16000, "epoch": 0.09586463912954908, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 16, "epoch": 9.586463912954908e-05, "loss/policy_avg": 0.015691569074988365, "lr": 1e-05, "objective/entropy": 136.889404296875, "objective/kl": 13.172518730163574, "objective/non_score_reward": -0.6586259603500366, "objective/rlhf_reward": -1.2559016580260813, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 330.0568542480469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.75, "step": 0, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999426007270813 }, { "episode": 32, "epoch": 0.00019172927825909816, "loss/policy_avg": 0.021727558225393295, "lr": 9.999360940695298e-06, "objective/entropy": -4.705432891845703, "objective/kl": 4.4086012840271, "objective/non_score_reward": -0.22043009102344513, "objective/rlhf_reward": 0.49688179692854306, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 25.247615814208984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4375, "step": 1, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005669593811035 }, { "episode": 48, "epoch": 0.00028759391738864725, "loss/policy_avg": 0.05422616004943848, "lr": 9.998721881390595e-06, "objective/entropy": 26.511795043945312, "objective/kl": 10.364278793334961, "objective/non_score_reward": -0.5182140469551086, "objective/rlhf_reward": -0.6222579917923059, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 174.7788543701172, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6328125, "step": 2, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001580238342285 }, { "episode": 64, "epoch": 0.0003834585565181963, "loss/policy_avg": 0.1031150370836258, "lr": 9.99808282208589e-06, "objective/entropy": -6.2874298095703125, "objective/kl": 7.10389518737793, "objective/non_score_reward": -0.35519474744796753, "objective/rlhf_reward": 0.24108044284523888, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 107.51742553710938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.806640625, "step": 3, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999995231628418 }, { "episode": 80, "epoch": 0.0004793231956477454, "loss/policy_avg": 0.020609447732567787, "lr": 9.997443762781187e-06, "objective/entropy": 63.54547882080078, "objective/kl": 1.458254337310791, "objective/non_score_reward": -0.07291271537542343, "objective/rlhf_reward": 1.224120924828116, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 14.240117073059082, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4150390625, "step": 4, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000563383102417 }, { "episode": 96, "epoch": 0.0005751878347772945, "loss/policy_avg": 0.1277482807636261, "lr": 9.996804703476484e-06, "objective/entropy": 55.068546295166016, "objective/kl": 8.753851890563965, "objective/non_score_reward": -0.43769264221191406, "objective/rlhf_reward": -0.37216834077010735, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 100.08578491210938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.447265625, "step": 5, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999474048614502 }, { "episode": 112, "epoch": 0.0006710524739068436, "loss/policy_avg": 0.3148539662361145, "lr": 9.99616564417178e-06, "objective/entropy": 21.463600158691406, "objective/kl": 9.847577095031738, "objective/non_score_reward": -0.4923788607120514, "objective/rlhf_reward": -0.02210425861352272, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 82.89840698242188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619140625, "step": 6, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998772382736206 }, { "episode": 128, "epoch": 0.0007669171130363926, "loss/policy_avg": -9.760260581970215e-06, "lr": 9.995526584867077e-06, "objective/entropy": 43.514984130859375, "objective/kl": 6.468422889709473, "objective/non_score_reward": -0.3234211802482605, "objective/rlhf_reward": 0.18726797867262368, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 53.660911560058594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.595703125, "step": 7, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0024185180664062 }, { "episode": 144, "epoch": 0.0008627817521659417, "loss/policy_avg": 0.07420124113559723, "lr": 9.994887525562374e-06, "objective/entropy": 111.558837890625, "objective/kl": 5.765064716339111, "objective/non_score_reward": -0.2882532477378845, "objective/rlhf_reward": 0.7943982454372089, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 38.34186935424805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4462890625, "step": 8, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975563287734985 }, { "episode": 160, "epoch": 0.0009586463912954908, "loss/policy_avg": 0.22252294421195984, "lr": 9.99424846625767e-06, "objective/entropy": 99.2086181640625, "objective/kl": 8.770297050476074, "objective/non_score_reward": -0.4385148584842682, "objective/rlhf_reward": -0.35405938923358926, "objective/scores": 0.35, "policy/approxkl_avg": 98.07421112060547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.75, "step": 9, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9961905479431152 }, { "episode": 176, "epoch": 0.0010545110304250398, "loss/policy_avg": 0.05278925597667694, "lr": 9.993609406952966e-06, "objective/entropy": 192.25936889648438, "objective/kl": 5.483057975769043, "objective/non_score_reward": -0.27415287494659424, "objective/rlhf_reward": 1.3033885151147842, "objective/scores": 0.6, "policy/approxkl_avg": 54.852699279785156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.73046875, "step": 10, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001378059387207 }, { "episode": 192, "epoch": 0.001150375669554589, "loss/policy_avg": 0.01604432426393032, "lr": 9.992970347648263e-06, "objective/entropy": 91.4354476928711, "objective/kl": 1.6482281684875488, "objective/non_score_reward": -0.08241140842437744, "objective/rlhf_reward": 1.1513069728358984, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 12.662862777709961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5390625, "step": 11, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994404315948486 }, { "episode": 208, "epoch": 0.001246240308684138, "loss/policy_avg": 0.17367278039455414, "lr": 9.992331288343558e-06, "objective/entropy": 148.37680053710938, "objective/kl": 9.977045059204102, "objective/non_score_reward": -0.4988522529602051, "objective/rlhf_reward": -0.4796372515880427, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 132.6361083984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4619140625, "step": 12, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9963808059692383 }, { "episode": 224, "epoch": 0.0013421049478136871, "loss/policy_avg": -0.12138635665178299, "lr": 9.991692229038855e-06, "objective/entropy": -70.20156860351562, "objective/kl": 3.8376624584198, "objective/non_score_reward": -0.1918831169605255, "objective/rlhf_reward": 0.6324675619602202, "objective/scores": 0.35, "policy/approxkl_avg": 15.127391815185547, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.505859375, "step": 13, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.017153739929199 }, { "episode": 240, "epoch": 0.001437969586943236, "loss/policy_avg": 0.1106414794921875, "lr": 9.991053169734152e-06, "objective/entropy": 129.54013061523438, "objective/kl": 12.085613250732422, "objective/non_score_reward": -0.6042807102203369, "objective/rlhf_reward": -0.6837895224491755, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 178.22561645507812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5390625, "step": 14, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999481201171875 }, { "episode": 256, "epoch": 0.0015338342260727853, "loss/policy_avg": 0.01672934927046299, "lr": 9.990414110429449e-06, "objective/entropy": 177.98126220703125, "objective/kl": 7.125063896179199, "objective/non_score_reward": -0.3562532067298889, "objective/rlhf_reward": -0.025012841820716947, "objective/scores": 0.35, "policy/approxkl_avg": 91.47238159179688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.716796875, "step": 15, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000582218170166 }, { "episode": 272, "epoch": 0.0016296988652023342, "loss/policy_avg": 0.14258402585983276, "lr": 9.989775051124744e-06, "objective/entropy": 197.2217559814453, "objective/kl": 12.70147705078125, "objective/non_score_reward": -0.6350738406181335, "objective/rlhf_reward": -1.1616931343949852, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 84.26277160644531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.630859375, "step": 16, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9964921474456787 }, { "episode": 288, "epoch": 0.0017255635043318834, "loss/policy_avg": -0.0007228106260299683, "lr": 9.989135991820041e-06, "objective/entropy": -9.756143569946289, "objective/kl": 7.940765380859375, "objective/non_score_reward": -0.3970382809638977, "objective/rlhf_reward": -0.07238138595455501, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 42.61369323730469, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.64453125, "step": 17, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0011234283447266 }, { "episode": 304, "epoch": 0.0018214281434614326, "loss/policy_avg": 0.13892704248428345, "lr": 9.988496932515338e-06, "objective/entropy": 14.549068450927734, "objective/kl": 9.783748626708984, "objective/non_score_reward": -0.48918741941452026, "objective/rlhf_reward": -0.5781475538886606, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 73.81009674072266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.607421875, "step": 18, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998319149017334 }, { "episode": 320, "epoch": 0.0019172927825909815, "loss/policy_avg": 0.12347989529371262, "lr": 9.987857873210635e-06, "objective/entropy": 197.0328369140625, "objective/kl": 9.07555103302002, "objective/non_score_reward": -0.453777551651001, "objective/rlhf_reward": -0.15325071436225013, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 74.28388214111328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5625, "step": 19, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001952648162842 }, { "episode": 336, "epoch": 0.0020131574217205307, "loss/policy_avg": 0.06666804850101471, "lr": 9.987218813905932e-06, "objective/entropy": 180.56707763671875, "objective/kl": 10.346174240112305, "objective/non_score_reward": -0.5173087120056152, "objective/rlhf_reward": -0.6454025848704257, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 88.01742553710938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.595703125, "step": 20, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9958171844482422 }, { "episode": 352, "epoch": 0.0021090220608500796, "loss/policy_avg": 0.12632718682289124, "lr": 9.986579754601228e-06, "objective/entropy": 165.49900817871094, "objective/kl": 10.707776069641113, "objective/non_score_reward": -0.5353888273239136, "objective/rlhf_reward": -0.7629530663169442, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 118.42108917236328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.78125, "step": 21, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9964122772216797 }, { "episode": 368, "epoch": 0.0022048866999796286, "loss/policy_avg": 0.012576747685670853, "lr": 9.985940695296524e-06, "objective/entropy": -133.83059692382812, "objective/kl": 6.06254768371582, "objective/non_score_reward": -0.3031274080276489, "objective/rlhf_reward": 0.21132251183215, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.497255325317383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.552734375, "step": 22, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017709732055664 }, { "episode": 384, "epoch": 0.002300751339109178, "loss/policy_avg": 0.21566970646381378, "lr": 9.98530163599182e-06, "objective/entropy": 80.05180358886719, "objective/kl": 18.019107818603516, "objective/non_score_reward": -0.9009554386138916, "objective/rlhf_reward": -2.1799896850186267, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 244.3957061767578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.72265625, "step": 23, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975435733795166 }, { "episode": 400, "epoch": 0.002396615978238727, "loss/policy_avg": 0.21825431287288666, "lr": 9.984662576687117e-06, "objective/entropy": 22.858154296875, "objective/kl": 7.889187812805176, "objective/non_score_reward": -0.39445942640304565, "objective/rlhf_reward": 0.5448686011871957, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 45.33286666870117, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.54296875, "step": 24, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998900890350342 }, { "episode": 416, "epoch": 0.002492480617368276, "loss/policy_avg": 0.2645857036113739, "lr": 9.984023517382414e-06, "objective/entropy": 37.619895935058594, "objective/kl": 11.23090934753418, "objective/non_score_reward": -0.5615454316139221, "objective/rlhf_reward": 0.15381827354431143, "objective/scores": 0.6, "policy/approxkl_avg": 88.95787811279297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.552734375, "step": 25, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996498703956604 }, { "episode": 432, "epoch": 0.002588345256497825, "loss/policy_avg": 0.04753335565328598, "lr": 9.983384458077711e-06, "objective/entropy": 156.34921264648438, "objective/kl": 7.371222496032715, "objective/non_score_reward": -0.36856111884117126, "objective/rlhf_reward": -0.14873159292332616, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 35.437461853027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 26, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979305267333984 }, { "episode": 448, "epoch": 0.0026842098956273742, "loss/policy_avg": -0.010932949371635914, "lr": 9.982745398773006e-06, "objective/entropy": 16.393407821655273, "objective/kl": 16.967132568359375, "objective/non_score_reward": -0.8483567237854004, "objective/rlhf_reward": -2.051791122465759, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 207.71142578125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.564453125, "step": 27, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9935011863708496 }, { "episode": 464, "epoch": 0.002780074534756923, "loss/policy_avg": 0.23893436789512634, "lr": 9.982106339468303e-06, "objective/entropy": 170.59136962890625, "objective/kl": 15.129783630371094, "objective/non_score_reward": -0.7564891576766968, "objective/rlhf_reward": -1.469697265830591, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 135.97763061523438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.72265625, "step": 28, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975056648254395 }, { "episode": 480, "epoch": 0.002875939173886472, "loss/policy_avg": 0.03272615000605583, "lr": 9.9814672801636e-06, "objective/entropy": 6.700323104858398, "objective/kl": 10.701581954956055, "objective/non_score_reward": -0.5350791215896606, "objective/rlhf_reward": -0.6897181971982564, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 63.513145446777344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 29, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998331069946289 }, { "episode": 496, "epoch": 0.0029718038130160216, "loss/policy_avg": 0.07188314199447632, "lr": 9.980828220858897e-06, "objective/entropy": -47.331199645996094, "objective/kl": 12.874979019165039, "objective/non_score_reward": -0.6437489986419678, "objective/rlhf_reward": -1.1963937664903224, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 77.876220703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5390625, "step": 30, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967491626739502 }, { "episode": 512, "epoch": 0.0030676684521455705, "loss/policy_avg": 0.04047826677560806, "lr": 9.980189161554194e-06, "objective/entropy": 282.3853759765625, "objective/kl": 9.654375076293945, "objective/non_score_reward": -0.4827187657356262, "objective/rlhf_reward": -0.5716251668676566, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 64.11791229248047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.89453125, "step": 31, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997191429138184 }, { "episode": 528, "epoch": 0.0031635330912751195, "loss/policy_avg": 0.07097287476062775, "lr": 9.97955010224949e-06, "objective/entropy": 116.042236328125, "objective/kl": 14.595599174499512, "objective/non_score_reward": -0.7297799587249756, "objective/rlhf_reward": -0.7964137478926516, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 272.6925048828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3857421875, "step": 32, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0027289390563965 }, { "episode": 544, "epoch": 0.0032593977304046684, "loss/policy_avg": 0.5246497392654419, "lr": 9.978911042944786e-06, "objective/entropy": 8.318304061889648, "objective/kl": 16.622827529907227, "objective/non_score_reward": -0.831141471862793, "objective/rlhf_reward": -1.9990529752074906, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 159.0550079345703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.703125, "step": 33, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971305131912231 }, { "episode": 560, "epoch": 0.003355262369534218, "loss/policy_avg": 0.20073390007019043, "lr": 9.978271983640083e-06, "objective/entropy": 92.97464752197266, "objective/kl": 10.66767692565918, "objective/non_score_reward": -0.5333837866783142, "objective/rlhf_reward": 2.2664648383855823, "objective/scores": 1.1, "policy/approxkl_avg": 89.14144134521484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.544921875, "step": 34, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000143051147461 }, { "episode": 576, "epoch": 0.0034511270086637668, "loss/policy_avg": 0.04765152558684349, "lr": 9.977632924335378e-06, "objective/entropy": 149.43089294433594, "objective/kl": 16.67333221435547, "objective/non_score_reward": -0.8336665630340576, "objective/rlhf_reward": -0.9346663713455201, "objective/scores": 0.6, "policy/approxkl_avg": 189.3590850830078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4765625, "step": 35, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9986698627471924 }, { "episode": 592, "epoch": 0.0035469916477933157, "loss/policy_avg": 0.40008074045181274, "lr": 9.976993865030675e-06, "objective/entropy": 157.10501098632812, "objective/kl": 13.927867889404297, "objective/non_score_reward": -0.6963933706283569, "objective/rlhf_reward": -1.406971328941685, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 121.78231811523438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.70703125, "step": 36, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974275827407837 }, { "episode": 608, "epoch": 0.003642856286922865, "loss/policy_avg": 0.08663024008274078, "lr": 9.976354805725972e-06, "objective/entropy": 47.76446533203125, "objective/kl": 13.560833930969238, "objective/non_score_reward": -0.6780416965484619, "objective/rlhf_reward": -0.5894605539002753, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 43.71810531616211, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5078125, "step": 37, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991490840911865 }, { "episode": 624, "epoch": 0.003738720926052414, "loss/policy_avg": 0.08268876373767853, "lr": 9.975715746421269e-06, "objective/entropy": 192.41729736328125, "objective/kl": 6.687016010284424, "objective/non_score_reward": -0.3343508243560791, "objective/rlhf_reward": 0.021846643354015427, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 67.82701873779297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619140625, "step": 38, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999939203262329 }, { "episode": 640, "epoch": 0.003834585565181963, "loss/policy_avg": 0.05995899811387062, "lr": 9.975076687116566e-06, "objective/entropy": -98.350341796875, "objective/kl": 9.015666961669922, "objective/non_score_reward": -0.450783371925354, "objective/rlhf_reward": 0.14427768908268623, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 51.733055114746094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 39, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974713325500488 }, { "episode": 656, "epoch": 0.003930450204311512, "loss/policy_avg": 0.18854951858520508, "lr": 9.97443762781186e-06, "objective/entropy": 141.67947387695312, "objective/kl": 10.309185028076172, "objective/non_score_reward": -0.5154592990875244, "objective/rlhf_reward": -0.6618371069431306, "objective/scores": 0.35, "policy/approxkl_avg": 71.02857208251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.744140625, "step": 40, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993364810943604 }, { "episode": 672, "epoch": 0.004026314843441061, "loss/policy_avg": 0.05062849074602127, "lr": 9.973798568507158e-06, "objective/entropy": -38.6858024597168, "objective/kl": 9.445882797241211, "objective/non_score_reward": -0.4722941517829895, "objective/rlhf_reward": -1.8891766667366028, "objective/scores": 0.0, "policy/approxkl_avg": 5.4856438636779785, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62890625, "step": 41, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984209537506104 }, { "episode": 688, "epoch": 0.00412217948257061, "loss/policy_avg": 0.09501229226589203, "lr": 9.973159509202454e-06, "objective/entropy": 17.35771942138672, "objective/kl": 10.873266220092773, "objective/non_score_reward": -0.5436632633209229, "objective/rlhf_reward": -0.44131985406080876, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 98.38662719726562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6953125, "step": 42, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995697736740112 }, { "episode": 704, "epoch": 0.004218044121700159, "loss/policy_avg": 0.32498252391815186, "lr": 9.972520449897751e-06, "objective/entropy": 174.98866271972656, "objective/kl": 11.279447555541992, "objective/non_score_reward": -0.5639723539352417, "objective/rlhf_reward": -0.7749369321421384, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 62.73210144042969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.552734375, "step": 43, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0007286071777344 }, { "episode": 720, "epoch": 0.004313908760829708, "loss/policy_avg": 0.3995896577835083, "lr": 9.971881390593048e-06, "objective/entropy": 36.609832763671875, "objective/kl": 19.769756317138672, "objective/non_score_reward": -0.9884878993034363, "objective/rlhf_reward": -2.1291227295723667, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 164.33892822265625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.708984375, "step": 44, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9945415258407593 }, { "episode": 736, "epoch": 0.004409773399959257, "loss/policy_avg": 0.17710548639297485, "lr": 9.971242331288345e-06, "objective/entropy": 93.23808288574219, "objective/kl": 16.88797378540039, "objective/non_score_reward": -0.8443987965583801, "objective/rlhf_reward": -1.7157356492882831, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 54.64923858642578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.779296875, "step": 45, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981857538223267 }, { "episode": 752, "epoch": 0.004505638039088807, "loss/policy_avg": 0.32767364382743835, "lr": 9.97060327198364e-06, "objective/entropy": 202.11843872070312, "objective/kl": 14.050471305847168, "objective/non_score_reward": -0.7025235295295715, "objective/rlhf_reward": -1.484581295281572, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 76.14016723632812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 46, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997007846832275 }, { "episode": 768, "epoch": 0.004601502678218356, "loss/policy_avg": 0.08174459636211395, "lr": 9.969964212678937e-06, "objective/entropy": 54.37752151489258, "objective/kl": 15.1139497756958, "objective/non_score_reward": -0.75569748878479, "objective/rlhf_reward": -1.6635400888666343, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 83.4612045288086, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4296875, "step": 47, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9972434043884277 }, { "episode": 784, "epoch": 0.004697367317347905, "loss/policy_avg": 0.03365965187549591, "lr": 9.969325153374234e-06, "objective/entropy": 85.39935302734375, "objective/kl": 13.452342987060547, "objective/non_score_reward": -0.6726170778274536, "objective/rlhf_reward": -0.74305723138326, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 61.629390716552734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.572265625, "step": 48, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998470544815063 }, { "episode": 800, "epoch": 0.004793231956477454, "loss/policy_avg": 0.009335246868431568, "lr": 9.968686094069531e-06, "objective/entropy": 288.22564697265625, "objective/kl": 19.127742767333984, "objective/non_score_reward": -0.9563871026039124, "objective/rlhf_reward": -0.9018295153391089, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 176.43731689453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.892578125, "step": 49, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9936624765396118 }, { "episode": 816, "epoch": 0.004889096595607003, "loss/policy_avg": 0.13336139917373657, "lr": 9.968047034764828e-06, "objective/entropy": -38.686851501464844, "objective/kl": 18.06523895263672, "objective/non_score_reward": -0.9032620191574097, "objective/rlhf_reward": -2.1320952503041024, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 179.73486328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65625, "step": 50, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996016263961792 }, { "episode": 832, "epoch": 0.004984961234736552, "loss/policy_avg": 0.09758515655994415, "lr": 9.967407975460123e-06, "objective/entropy": -32.55284881591797, "objective/kl": 10.72513198852539, "objective/non_score_reward": -0.5362565517425537, "objective/rlhf_reward": -0.721194286544887, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 44.48727798461914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736328125, "step": 51, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976041316986084 }, { "episode": 848, "epoch": 0.005080825873866101, "loss/policy_avg": 0.5202991366386414, "lr": 9.96676891615542e-06, "objective/entropy": 45.2802734375, "objective/kl": 16.129152297973633, "objective/non_score_reward": -0.8064576387405396, "objective/rlhf_reward": -1.2784193260239918, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 124.33740234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.623046875, "step": 52, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978928565979004 }, { "episode": 864, "epoch": 0.00517669051299565, "loss/policy_avg": 0.28677505254745483, "lr": 9.966129856850717e-06, "objective/entropy": -76.81179809570312, "objective/kl": 15.223251342773438, "objective/non_score_reward": -0.761162519454956, "objective/rlhf_reward": -1.5288782207094989, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 69.77767944335938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7890625, "step": 53, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999229907989502 }, { "episode": 880, "epoch": 0.0052725551521251995, "loss/policy_avg": 0.20859162509441376, "lr": 9.965490797546014e-06, "objective/entropy": -21.344478607177734, "objective/kl": 10.70494556427002, "objective/non_score_reward": -0.535247266292572, "objective/rlhf_reward": -0.7623869264997064, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 98.75808715820312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.56640625, "step": 54, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975996017456055 }, { "episode": 896, "epoch": 0.0053684197912547485, "loss/policy_avg": 1.2579694986343384, "lr": 9.96485173824131e-06, "objective/entropy": 164.7299346923828, "objective/kl": 18.096805572509766, "objective/non_score_reward": -0.9048402309417725, "objective/rlhf_reward": -2.0152409709134873, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 95.78445434570312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.736328125, "step": 55, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9966285228729248 }, { "episode": 912, "epoch": 0.0054642844303842975, "loss/policy_avg": 0.3564913868904114, "lr": 9.964212678936606e-06, "objective/entropy": 85.46858215332031, "objective/kl": 17.930484771728516, "objective/non_score_reward": -0.89652419090271, "objective/rlhf_reward": -1.4633905313172677, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 79.41477966308594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4091796875, "step": 56, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984946250915527 }, { "episode": 928, "epoch": 0.005560149069513846, "loss/policy_avg": 0.03960660099983215, "lr": 9.963573619631903e-06, "objective/entropy": 205.954833984375, "objective/kl": 17.15917205810547, "objective/non_score_reward": -0.8579585552215576, "objective/rlhf_reward": -1.3091281972089148, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 23.591196060180664, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.791015625, "step": 57, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997645378112793 }, { "episode": 944, "epoch": 0.005656013708643395, "loss/policy_avg": -0.00983378104865551, "lr": 9.9629345603272e-06, "objective/entropy": -1.1022186279296875, "objective/kl": 16.26142692565918, "objective/non_score_reward": -0.8130713105201721, "objective/rlhf_reward": 1.1477148175239567, "objective/scores": 1.1, "policy/approxkl_avg": 81.65092468261719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.552734375, "step": 58, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99957275390625 }, { "episode": 960, "epoch": 0.005751878347772944, "loss/policy_avg": 0.32060182094573975, "lr": 9.962295501022495e-06, "objective/entropy": 48.09014892578125, "objective/kl": 7.438636302947998, "objective/non_score_reward": -0.3719318211078644, "objective/rlhf_reward": 0.6349789739391469, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.77626895904541, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.822265625, "step": 59, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.008031129837036 }, { "episode": 976, "epoch": 0.005847742986902493, "loss/policy_avg": 0.2516993582248688, "lr": 9.961656441717792e-06, "objective/entropy": -46.64883804321289, "objective/kl": 19.601835250854492, "objective/non_score_reward": -0.9800918102264404, "objective/rlhf_reward": -2.594854134946985, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 181.5974578857422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.609375, "step": 60, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988071918487549 }, { "episode": 992, "epoch": 0.005943607626032043, "loss/policy_avg": 0.1109720841050148, "lr": 9.961017382413088e-06, "objective/entropy": 97.6422348022461, "objective/kl": 13.844486236572266, "objective/non_score_reward": -0.692224383354187, "objective/rlhf_reward": -1.2126380791335847, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 96.34603118896484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 61, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974918365478516 }, { "episode": 1008, "epoch": 0.006039472265161592, "loss/policy_avg": -0.05115126073360443, "lr": 9.960378323108385e-06, "objective/entropy": 34.42061996459961, "objective/kl": 14.079090118408203, "objective/non_score_reward": -0.7039545774459839, "objective/rlhf_reward": -1.4565682944997977, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 49.87873840332031, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.677734375, "step": 62, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982357025146484 }, { "episode": 1024, "epoch": 0.006135336904291141, "loss/policy_avg": 0.22280101478099823, "lr": 9.959739263803682e-06, "objective/entropy": -24.89067840576172, "objective/kl": 19.501176834106445, "objective/non_score_reward": -0.9750589728355408, "objective/rlhf_reward": -2.4496376319841, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 243.47512817382812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.888671875, "step": 63, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999916911125183 }, { "episode": 1040, "epoch": 0.00623120154342069, "loss/policy_avg": 0.36840492486953735, "lr": 9.959100204498979e-06, "objective/entropy": 134.6929931640625, "objective/kl": 22.332670211791992, "objective/non_score_reward": -1.1166335344314575, "objective/rlhf_reward": -2.641705389293741, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 136.65045166015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 64, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981276988983154 }, { "episode": 1056, "epoch": 0.006327066182550239, "loss/policy_avg": 0.09098342061042786, "lr": 9.958461145194274e-06, "objective/entropy": -26.864063262939453, "objective/kl": 13.052759170532227, "objective/non_score_reward": -0.6526379585266113, "objective/rlhf_reward": -0.7857228770580997, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 62.885929107666016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.603515625, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997183084487915 }, { "episode": 1072, "epoch": 0.006422930821679788, "loss/policy_avg": 0.27086368203163147, "lr": 9.957822085889571e-06, "objective/entropy": -58.01667404174805, "objective/kl": 16.48623275756836, "objective/non_score_reward": -0.8243115544319153, "objective/rlhf_reward": -1.635386770189391, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 153.92050170898438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 66, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005505084991455 }, { "episode": 1088, "epoch": 0.006518795460809337, "loss/policy_avg": 1.2388324737548828, "lr": 9.957183026584868e-06, "objective/entropy": 99.91399383544922, "objective/kl": 21.524110794067383, "objective/non_score_reward": -1.0762056112289429, "objective/rlhf_reward": -2.6429626993542774, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 170.69760131835938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.64453125, "step": 67, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9946773052215576 }, { "episode": 1104, "epoch": 0.006614660099938887, "loss/policy_avg": 0.330521821975708, "lr": 9.956543967280165e-06, "objective/entropy": -76.99481201171875, "objective/kl": 15.58948802947998, "objective/non_score_reward": -0.7794743776321411, "objective/rlhf_reward": -1.7178976856172086, "objective/scores": 0.35, "policy/approxkl_avg": 218.45574951171875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.64453125, "step": 68, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997715950012207 }, { "episode": 1120, "epoch": 0.006710524739068436, "loss/policy_avg": 0.11920663714408875, "lr": 9.955904907975462e-06, "objective/entropy": 70.55160522460938, "objective/kl": 20.134777069091797, "objective/non_score_reward": -1.0067389011383057, "objective/rlhf_reward": -2.6853197722727353, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 62.195674896240234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.34765625, "step": 69, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001072406768799 }, { "episode": 1136, "epoch": 0.006806389378197985, "loss/policy_avg": -0.17695794999599457, "lr": 9.955265848670757e-06, "objective/entropy": 101.99272918701172, "objective/kl": 12.69788932800293, "objective/non_score_reward": -0.6348943710327148, "objective/rlhf_reward": -2.539577692747116, "objective/scores": 0.0, "policy/approxkl_avg": 64.835693359375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.44140625, "step": 70, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0115315914154053 }, { "episode": 1152, "epoch": 0.0069022540173275335, "loss/policy_avg": 0.35137245059013367, "lr": 9.954626789366054e-06, "objective/entropy": 79.80499267578125, "objective/kl": 21.120101928710938, "objective/non_score_reward": -1.0560050010681152, "objective/rlhf_reward": -2.1013141296067577, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 124.16864776611328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.68359375, "step": 71, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998917818069458 }, { "episode": 1168, "epoch": 0.0069981186564570825, "loss/policy_avg": 0.07422849535942078, "lr": 9.95398773006135e-06, "objective/entropy": 9.376724243164062, "objective/kl": 15.093628883361816, "objective/non_score_reward": -0.7546814680099487, "objective/rlhf_reward": -1.6594760653719138, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 47.567962646484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65234375, "step": 72, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9958832263946533 }, { "episode": 1184, "epoch": 0.0070939832955866314, "loss/policy_avg": 0.11969298124313354, "lr": 9.953348670756648e-06, "objective/entropy": 133.57423400878906, "objective/kl": 20.2343807220459, "objective/non_score_reward": -1.0117191076278687, "objective/rlhf_reward": -1.1231571778070655, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 93.79672241210938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.423828125, "step": 73, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0005669593811035 }, { "episode": 1200, "epoch": 0.00718984793471618, "loss/policy_avg": 0.2395152747631073, "lr": 9.952709611451944e-06, "objective/entropy": 31.68697166442871, "objective/kl": 20.96116828918457, "objective/non_score_reward": -1.0480585098266602, "objective/rlhf_reward": -2.711281481202006, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 194.83474731445312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.669921875, "step": 74, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9953577518463135 }, { "episode": 1216, "epoch": 0.00728571257384573, "loss/policy_avg": 0.27856501936912537, "lr": 9.952070552147241e-06, "objective/entropy": 119.42091369628906, "objective/kl": 11.30095100402832, "objective/non_score_reward": -0.5650476217269897, "objective/rlhf_reward": -0.9185547738367612, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 59.14590835571289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.75, "step": 75, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9953703880310059 }, { "episode": 1232, "epoch": 0.007381577212975279, "loss/policy_avg": 0.21030786633491516, "lr": 9.951431492842536e-06, "objective/entropy": 7.310768127441406, "objective/kl": 6.645857810974121, "objective/non_score_reward": -0.3322928845882416, "objective/rlhf_reward": 0.04943063011993787, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 14.611559867858887, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.591796875, "step": 76, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996391534805298 }, { "episode": 1248, "epoch": 0.007477441852104828, "loss/policy_avg": 0.4117072820663452, "lr": 9.950792433537833e-06, "objective/entropy": -109.53082275390625, "objective/kl": 11.825650215148926, "objective/non_score_reward": -0.5912825465202332, "objective/rlhf_reward": 0.03486987352371207, "objective/scores": 0.6, "policy/approxkl_avg": 19.0810604095459, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6171875, "step": 77, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981213808059692 }, { "episode": 1264, "epoch": 0.007573306491234377, "loss/policy_avg": 0.2597622275352478, "lr": 9.950153374233129e-06, "objective/entropy": -29.7529296875, "objective/kl": 18.43012809753418, "objective/non_score_reward": -0.9215063452720642, "objective/rlhf_reward": -2.2860254704952236, "objective/scores": 0.35, "policy/approxkl_avg": 267.2847900390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.70703125, "step": 78, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997544765472412 }, { "episode": 1280, "epoch": 0.007669171130363926, "loss/policy_avg": 0.2407466471195221, "lr": 9.949514314928425e-06, "objective/entropy": 14.07373046875, "objective/kl": 20.781753540039062, "objective/non_score_reward": -1.0390876531600952, "objective/rlhf_reward": -1.2326316579591956, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 147.4822235107422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.724609375, "step": 79, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987099170684814 }, { "episode": 1296, "epoch": 0.007765035769493475, "loss/policy_avg": 0.17344285547733307, "lr": 9.948875255623722e-06, "objective/entropy": 112.44259643554688, "objective/kl": 10.0985746383667, "objective/non_score_reward": -0.504928708076477, "objective/rlhf_reward": 0.38028510808944693, "objective/scores": 0.6, "policy/approxkl_avg": 4.8866167068481445, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.443359375, "step": 80, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0143842697143555 }, { "episode": 1312, "epoch": 0.007860900408623025, "loss/policy_avg": 0.14816004037857056, "lr": 9.94823619631902e-06, "objective/entropy": 67.11033630371094, "objective/kl": 17.487518310546875, "objective/non_score_reward": -0.8743758797645569, "objective/rlhf_reward": -2.1558679251963193, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 18.69343376159668, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4619140625, "step": 81, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998113751411438 }, { "episode": 1328, "epoch": 0.007956765047752574, "loss/policy_avg": 0.2536642849445343, "lr": 9.947597137014316e-06, "objective/entropy": -71.85224914550781, "objective/kl": 11.223343849182129, "objective/non_score_reward": -0.5611672401428223, "objective/rlhf_reward": -0.7637163875654935, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 37.78028869628906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48828125, "step": 82, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0003702640533447 }, { "episode": 1344, "epoch": 0.008052629686882123, "loss/policy_avg": 0.3479039669036865, "lr": 9.946958077709611e-06, "objective/entropy": 146.41241455078125, "objective/kl": 20.458145141601562, "objective/non_score_reward": -1.0229072570800781, "objective/rlhf_reward": -2.732379042838497, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 64.28889465332031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.705078125, "step": 83, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976742267608643 }, { "episode": 1360, "epoch": 0.008148494326011672, "loss/policy_avg": 0.10525624454021454, "lr": 9.946319018404908e-06, "objective/entropy": -43.42662048339844, "objective/kl": 13.858359336853027, "objective/non_score_reward": -0.6929180026054382, "objective/rlhf_reward": -0.6489658228316642, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 61.37925720214844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48828125, "step": 84, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0012595653533936 }, { "episode": 1376, "epoch": 0.00824435896514122, "loss/policy_avg": 0.3409525156021118, "lr": 9.945679959100205e-06, "objective/entropy": 1.5508041381835938, "objective/kl": 19.05010223388672, "objective/non_score_reward": -0.9525051116943359, "objective/rlhf_reward": -2.205900583330708, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 97.6533203125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000422477722168 }, { "episode": 1392, "epoch": 0.00834022360427077, "loss/policy_avg": 0.3110717535018921, "lr": 9.945040899795502e-06, "objective/entropy": 215.75965881347656, "objective/kl": 18.800819396972656, "objective/non_score_reward": -0.9400409460067749, "objective/rlhf_reward": -2.156043860975819, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 84.93620300292969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.669921875, "step": 86, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9962902069091797 }, { "episode": 1408, "epoch": 0.008436088243400319, "loss/policy_avg": 0.02868543565273285, "lr": 9.944401840490799e-06, "objective/entropy": 154.10025024414062, "objective/kl": 13.492873191833496, "objective/non_score_reward": -0.6746436357498169, "objective/rlhf_reward": -0.9652413214246431, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 42.483882904052734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.44921875, "step": 87, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9983662366867065 }, { "episode": 1424, "epoch": 0.008531952882529868, "loss/policy_avg": 0.07607420533895493, "lr": 9.943762781186096e-06, "objective/entropy": 202.40365600585938, "objective/kl": 13.719297409057617, "objective/non_score_reward": -0.685964822769165, "objective/rlhf_reward": 1.6561407089233402, "objective/scores": 1.1, "policy/approxkl_avg": 20.57819175720215, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.728515625, "step": 88, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999366283416748 }, { "episode": 1440, "epoch": 0.008627817521659416, "loss/policy_avg": 0.16665664315223694, "lr": 9.94312372188139e-06, "objective/entropy": -100.20193481445312, "objective/kl": 15.216776847839355, "objective/non_score_reward": -0.7608388662338257, "objective/rlhf_reward": -1.4392355120817002, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 85.36731719970703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 89, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990196228027344 }, { "episode": 1456, "epoch": 0.008723682160788965, "loss/policy_avg": 0.19817781448364258, "lr": 9.942484662576688e-06, "objective/entropy": -0.7409725189208984, "objective/kl": 10.389724731445312, "objective/non_score_reward": -0.5194862484931946, "objective/rlhf_reward": 2.3220549762248996, "objective/scores": 1.1, "policy/approxkl_avg": 12.642692565917969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.52734375, "step": 90, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989277124404907 }, { "episode": 1472, "epoch": 0.008819546799918514, "loss/policy_avg": 0.2365586757659912, "lr": 9.941845603271985e-06, "objective/entropy": 152.64306640625, "objective/kl": 21.58309555053711, "objective/non_score_reward": -1.0791547298431396, "objective/rlhf_reward": -2.9573691723093223, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 87.72661590576172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.771484375, "step": 91, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999784231185913 }, { "episode": 1488, "epoch": 0.008915411439048063, "loss/policy_avg": 0.059907689690589905, "lr": 9.941206543967281e-06, "objective/entropy": 89.6580810546875, "objective/kl": 16.996726989746094, "objective/non_score_reward": -0.8498364686965942, "objective/rlhf_reward": -1.9755135669308581, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 72.40145874023438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.861328125, "step": 92, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003039836883545 }, { "episode": 1504, "epoch": 0.009011276078177614, "loss/policy_avg": 0.14265713095664978, "lr": 9.940567484662578e-06, "objective/entropy": -33.708492279052734, "objective/kl": 15.94516372680664, "objective/non_score_reward": -0.797258198261261, "objective/rlhf_reward": -0.2653137638580527, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 78.95989990234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53125, "step": 93, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997810959815979 }, { "episode": 1520, "epoch": 0.009107140717307163, "loss/policy_avg": -0.018713245168328285, "lr": 9.939928425357874e-06, "objective/entropy": -3.091245651245117, "objective/kl": 14.482427597045898, "objective/non_score_reward": -0.7241213917732239, "objective/rlhf_reward": -1.2346261046534641, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 56.76847839355469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.501953125, "step": 94, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993091821670532 }, { "episode": 1536, "epoch": 0.009203005356436712, "loss/policy_avg": -0.0069353943690657616, "lr": 9.93928936605317e-06, "objective/entropy": 95.46006774902344, "objective/kl": 20.928672790527344, "objective/non_score_reward": -1.046433687210083, "objective/rlhf_reward": -2.360906060012888, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 103.58160400390625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974098205566406 }, { "episode": 1552, "epoch": 0.009298869995566261, "loss/policy_avg": 0.0523187518119812, "lr": 9.938650306748467e-06, "objective/entropy": 16.342994689941406, "objective/kl": 20.205509185791016, "objective/non_score_reward": -1.0102753639221191, "objective/rlhf_reward": 0.35889836549758947, "objective/scores": 1.1, "policy/approxkl_avg": 84.55277252197266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4697265625, "step": 96, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000453233718872 }, { "episode": 1568, "epoch": 0.00939473463469581, "loss/policy_avg": 0.18428044021129608, "lr": 9.938011247443764e-06, "objective/entropy": -31.386062622070312, "objective/kl": 19.641075134277344, "objective/non_score_reward": -0.9820537567138672, "objective/rlhf_reward": -1.8055088541665412, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 92.56884002685547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59765625, "step": 97, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001230239868164 }, { "episode": 1584, "epoch": 0.009490599273825359, "loss/policy_avg": -0.11768925935029984, "lr": 9.937372188139061e-06, "objective/entropy": -29.0854434967041, "objective/kl": 16.647226333618164, "objective/non_score_reward": -0.8323614001274109, "objective/rlhf_reward": -1.9701957342371177, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.0866272449493408, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.541015625, "step": 98, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0018460750579834 }, { "episode": 1600, "epoch": 0.009586463912954908, "loss/policy_avg": 0.06727765500545502, "lr": 9.936733128834358e-06, "objective/entropy": 96.53413391113281, "objective/kl": 21.015684127807617, "objective/non_score_reward": -1.0507843494415283, "objective/rlhf_reward": -2.8031371593475343, "objective/scores": 0.35, "policy/approxkl_avg": 36.56340026855469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 99, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9949061870574951 }, { "episode": 1616, "epoch": 0.009682328552084457, "loss/policy_avg": 0.28386813402175903, "lr": 9.936094069529653e-06, "objective/entropy": 33.901954650878906, "objective/kl": 19.533782958984375, "objective/non_score_reward": -0.9766892194747925, "objective/rlhf_reward": -2.425804230387568, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 162.0339813232422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985501766204834 }, { "episode": 1632, "epoch": 0.009778193191214006, "loss/policy_avg": 0.11220409721136093, "lr": 9.93545501022495e-06, "objective/entropy": -3.93096923828125, "objective/kl": 22.981700897216797, "objective/non_score_reward": -1.1490850448608398, "objective/rlhf_reward": -3.1725080504017744, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 46.0514030456543, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6328125, "step": 101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0070252418518066 }, { "episode": 1648, "epoch": 0.009874057830343555, "loss/policy_avg": 0.20420242846012115, "lr": 9.934815950920245e-06, "objective/entropy": 198.98751831054688, "objective/kl": 17.92270278930664, "objective/non_score_reward": -0.8961352109909058, "objective/rlhf_reward": -1.759712155136179, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 55.74137878417969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65234375, "step": 102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980387687683105 }, { "episode": 1664, "epoch": 0.009969922469473104, "loss/policy_avg": 0.27041423320770264, "lr": 9.934176891615542e-06, "objective/entropy": 1.5637626647949219, "objective/kl": 12.633028030395508, "objective/non_score_reward": -0.6316514015197754, "objective/rlhf_reward": -0.7017769768563022, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 13.92137622833252, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4208984375, "step": 103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987752437591553 }, { "episode": 1680, "epoch": 0.010065787108602653, "loss/policy_avg": 0.318324476480484, "lr": 9.933537832310839e-06, "objective/entropy": 218.76858520507812, "objective/kl": 21.40100860595703, "objective/non_score_reward": -1.0700504779815674, "objective/rlhf_reward": -2.9385662584597165, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 90.99249267578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.677734375, "step": 104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998801827430725 }, { "episode": 1696, "epoch": 0.010161651747732202, "loss/policy_avg": 0.3075984716415405, "lr": 9.932898773006136e-06, "objective/entropy": -56.81090545654297, "objective/kl": 10.457717895507812, "objective/non_score_reward": -0.5228859186172485, "objective/rlhf_reward": -0.7129414687431871, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 48.63943862915039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.995776653289795 }, { "episode": 1712, "epoch": 0.01025751638686175, "loss/policy_avg": 0.5551585555076599, "lr": 9.932259713701433e-06, "objective/entropy": -48.12900924682617, "objective/kl": 21.915470123291016, "objective/non_score_reward": -1.0957735776901245, "objective/rlhf_reward": -1.459375207067701, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 33.369083404541016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.71484375, "step": 106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995157241821289 }, { "episode": 1728, "epoch": 0.0103533810259913, "loss/policy_avg": 0.252463161945343, "lr": 9.931620654396728e-06, "objective/entropy": -69.64755249023438, "objective/kl": 15.248108863830566, "objective/non_score_reward": -0.7624054551124573, "objective/rlhf_reward": -1.707986166983276, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 59.05755615234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7578125, "step": 107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9963257312774658 }, { "episode": 1744, "epoch": 0.01044924566512085, "loss/policy_avg": 0.13919854164123535, "lr": 9.930981595092025e-06, "objective/entropy": -133.55258178710938, "objective/kl": 17.2213134765625, "objective/non_score_reward": -0.8610656261444092, "objective/rlhf_reward": -2.0850126979097556, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 32.41887664794922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5234375, "step": 108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992578029632568 }, { "episode": 1760, "epoch": 0.010545110304250399, "loss/policy_avg": 0.5300755500793457, "lr": 9.930342535787322e-06, "objective/entropy": -9.471179962158203, "objective/kl": 18.607471466064453, "objective/non_score_reward": -0.9303736090660095, "objective/rlhf_reward": -2.3214945554733273, "objective/scores": 0.35, "policy/approxkl_avg": 31.75185203552246, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.654296875, "step": 109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994070529937744 }, { "episode": 1776, "epoch": 0.010640974943379948, "loss/policy_avg": 0.17107412219047546, "lr": 9.929703476482619e-06, "objective/entropy": 72.44110107421875, "objective/kl": 16.862125396728516, "objective/non_score_reward": -0.8431062698364258, "objective/rlhf_reward": -3.372425138950348, "objective/scores": 0.0, "policy/approxkl_avg": 66.22834777832031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.70703125, "step": 110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995293378829956 }, { "episode": 1792, "epoch": 0.010736839582509497, "loss/policy_avg": -0.11443672329187393, "lr": 9.929064417177915e-06, "objective/entropy": 80.82670593261719, "objective/kl": 18.79993438720703, "objective/non_score_reward": -0.9399967789649963, "objective/rlhf_reward": -2.336154927213756, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 31.270248413085938, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5625, "step": 111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.007622241973877 }, { "episode": 1808, "epoch": 0.010832704221639046, "loss/policy_avg": 0.0878123939037323, "lr": 9.928425357873212e-06, "objective/entropy": -118.92440795898438, "objective/kl": 17.83495330810547, "objective/non_score_reward": -0.8917477130889893, "objective/rlhf_reward": -2.2253551392847593, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 20.88257598876953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996976613998413 }, { "episode": 1824, "epoch": 0.010928568860768595, "loss/policy_avg": 0.18364591896533966, "lr": 9.927786298568507e-06, "objective/entropy": 8.144821166992188, "objective/kl": 14.821235656738281, "objective/non_score_reward": -0.741061806678772, "objective/rlhf_reward": -1.2309138337771097, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.778968811035156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000706911087036 }, { "episode": 1840, "epoch": 0.011024433499898144, "loss/policy_avg": 0.06979192793369293, "lr": 9.927147239263804e-06, "objective/entropy": -2.9724502563476562, "objective/kl": 17.076000213623047, "objective/non_score_reward": -0.8538000583648682, "objective/rlhf_reward": -1.8994284508549533, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 46.98078918457031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.798828125, "step": 114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999396800994873 }, { "episode": 1856, "epoch": 0.011120298139027693, "loss/policy_avg": 0.27465301752090454, "lr": 9.926508179959101e-06, "objective/entropy": 40.056610107421875, "objective/kl": 22.515907287597656, "objective/non_score_reward": -1.1257953643798828, "objective/rlhf_reward": -2.8413221291905506, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 81.93817138671875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0007479190826416 }, { "episode": 1872, "epoch": 0.011216162778157242, "loss/policy_avg": 0.3945024013519287, "lr": 9.925869120654398e-06, "objective/entropy": 69.15873718261719, "objective/kl": 21.74050521850586, "objective/non_score_reward": -1.0870254039764404, "objective/rlhf_reward": -3.0225888824760148, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 38.46895980834961, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59765625, "step": 116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014419555664062 }, { "episode": 1888, "epoch": 0.01131202741728679, "loss/policy_avg": 0.5689772367477417, "lr": 9.925230061349695e-06, "objective/entropy": 144.26678466796875, "objective/kl": 14.530990600585938, "objective/non_score_reward": -0.726549506187439, "objective/rlhf_reward": -1.1728648702303568, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.715579628944397, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8203125, "step": 117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0058815479278564 }, { "episode": 1904, "epoch": 0.01140789205641634, "loss/policy_avg": -0.025625256821513176, "lr": 9.92459100204499e-06, "objective/entropy": -91.6683120727539, "objective/kl": 16.61312484741211, "objective/non_score_reward": -0.8306561708450317, "objective/rlhf_reward": -1.944022663918835, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 18.064186096191406, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4990234375, "step": 118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999868392944336 }, { "episode": 1920, "epoch": 0.011503756695545889, "loss/policy_avg": 0.4135175943374634, "lr": 9.923951942740287e-06, "objective/entropy": 145.33905029296875, "objective/kl": 18.559207916259766, "objective/non_score_reward": -0.9279603958129883, "objective/rlhf_reward": -1.5891353509583808, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 19.033662796020508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981472492218018 }, { "episode": 1936, "epoch": 0.011599621334675438, "loss/policy_avg": 0.3322446942329407, "lr": 9.923312883435584e-06, "objective/entropy": 109.6761474609375, "objective/kl": 18.231651306152344, "objective/non_score_reward": -0.9115825891494751, "objective/rlhf_reward": -1.2463304907083512, "objective/scores": 0.6, "policy/approxkl_avg": 108.51126098632812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.65625, "step": 120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996593952178955 }, { "episode": 1952, "epoch": 0.011695485973804987, "loss/policy_avg": 0.22522342205047607, "lr": 9.92267382413088e-06, "objective/entropy": 95.46246337890625, "objective/kl": 16.838998794555664, "objective/non_score_reward": -0.841949999332428, "objective/rlhf_reward": -1.8520282743298375, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 14.038084983825684, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8046875, "step": 121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997774362564087 }, { "episode": 1968, "epoch": 0.011791350612934537, "loss/policy_avg": 0.18379229307174683, "lr": 9.922034764826178e-06, "objective/entropy": 138.12388610839844, "objective/kl": 25.93743324279785, "objective/non_score_reward": -1.2968716621398926, "objective/rlhf_reward": -3.828236812089367, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 26.206398010253906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.734375, "step": 122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0011606216430664 }, { "episode": 1984, "epoch": 0.011887215252064086, "loss/policy_avg": 0.31653979420661926, "lr": 9.921395705521473e-06, "objective/entropy": -44.61676788330078, "objective/kl": 21.166324615478516, "objective/non_score_reward": -1.0583162307739258, "objective/rlhf_reward": -2.9077520704566666, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 29.74887466430664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.521484375, "step": 123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996273517608643 }, { "episode": 2000, "epoch": 0.011983079891193635, "loss/policy_avg": 0.1589316874742508, "lr": 9.92075664621677e-06, "objective/entropy": -77.4912109375, "objective/kl": 20.79126739501953, "objective/non_score_reward": -1.0395634174346924, "objective/rlhf_reward": -2.4249199191729227, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 133.58343505859375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66015625, "step": 124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9961724281311035 }, { "episode": 2016, "epoch": 0.012078944530323184, "loss/policy_avg": 0.2586688995361328, "lr": 9.920117586912067e-06, "objective/entropy": 139.38818359375, "objective/kl": 21.455245971679688, "objective/non_score_reward": -1.072762370109558, "objective/rlhf_reward": -2.775277876647648, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 47.609947204589844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8125, "step": 125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975237846374512 }, { "episode": 2032, "epoch": 0.012174809169452733, "loss/policy_avg": 0.16066747903823853, "lr": 9.919478527607362e-06, "objective/entropy": 72.43231201171875, "objective/kl": 20.59688377380371, "objective/non_score_reward": -1.0298442840576172, "objective/rlhf_reward": 0.28062304258346593, "objective/scores": 1.1, "policy/approxkl_avg": 75.74966430664062, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.529296875, "step": 126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998389720916748 }, { "episode": 2048, "epoch": 0.012270673808582282, "loss/policy_avg": 0.07932023704051971, "lr": 9.918839468302659e-06, "objective/entropy": -12.7745361328125, "objective/kl": 20.53061294555664, "objective/non_score_reward": -1.0265307426452637, "objective/rlhf_reward": -2.7275206232942164, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 19.110069274902344, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.55859375, "step": 127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984248876571655 }, { "episode": 2064, "epoch": 0.012366538447711831, "loss/policy_avg": 0.27331969141960144, "lr": 9.918200408997956e-06, "objective/entropy": 101.82013702392578, "objective/kl": 18.18286895751953, "objective/non_score_reward": -0.9091434478759766, "objective/rlhf_reward": -2.2579716230310023, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 6.703115463256836, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.556640625, "step": 128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009617805480957 }, { "episode": 2080, "epoch": 0.01246240308684138, "loss/policy_avg": 0.4916057586669922, "lr": 9.917561349693252e-06, "objective/entropy": 88.1321029663086, "objective/kl": 23.30657958984375, "objective/non_score_reward": -1.165329098701477, "objective/rlhf_reward": -3.3020663795217704, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 142.93795776367188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967398643493652 }, { "episode": 2096, "epoch": 0.012558267725970929, "loss/policy_avg": 0.16071423888206482, "lr": 9.91692229038855e-06, "objective/entropy": 136.1899871826172, "objective/kl": 15.380975723266602, "objective/non_score_reward": -0.769048810005188, "objective/rlhf_reward": -0.6761951804161073, "objective/scores": 0.6, "policy/approxkl_avg": 28.551767349243164, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.56640625, "step": 130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.03311824798584 }, { "episode": 2112, "epoch": 0.012654132365100478, "loss/policy_avg": 0.0021135974675416946, "lr": 9.916283231083844e-06, "objective/entropy": -71.15084838867188, "objective/kl": 18.961715698242188, "objective/non_score_reward": -0.9480857849121094, "objective/rlhf_reward": -2.1304838709241016, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.844127893447876, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4833984375, "step": 131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009913444519043 }, { "episode": 2128, "epoch": 0.012749997004230027, "loss/policy_avg": 0.042635850608348846, "lr": 9.915644171779141e-06, "objective/entropy": 20.673603057861328, "objective/kl": 15.986173629760742, "objective/non_score_reward": -0.7993086576461792, "objective/rlhf_reward": -1.8555989473158414, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 36.049034118652344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.67578125, "step": 132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998380422592163 }, { "episode": 2144, "epoch": 0.012845861643359576, "loss/policy_avg": 0.46513473987579346, "lr": 9.915005112474438e-06, "objective/entropy": 5.5274505615234375, "objective/kl": 19.590290069580078, "objective/non_score_reward": -0.979514479637146, "objective/rlhf_reward": -2.5394558692849696, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 12.074180603027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.642578125, "step": 133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0011277198791504 }, { "episode": 2160, "epoch": 0.012941726282489125, "loss/policy_avg": 0.245748370885849, "lr": 9.914366053169735e-06, "objective/entropy": 65.60797119140625, "objective/kl": 19.637710571289062, "objective/non_score_reward": -0.9818854928016663, "objective/rlhf_reward": -1.980130786971982, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 50.17578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.791015625, "step": 134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983665943145752 }, { "episode": 2176, "epoch": 0.013037590921618674, "loss/policy_avg": 0.02180427499115467, "lr": 9.913726993865032e-06, "objective/entropy": 0.8936500549316406, "objective/kl": 24.33076286315918, "objective/non_score_reward": -1.2165381908416748, "objective/rlhf_reward": -3.524517109900146, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 69.30375671386719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5009765625, "step": 135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99745774269104 }, { "episode": 2192, "epoch": 0.013133455560748224, "loss/policy_avg": 0.36717042326927185, "lr": 9.913087934560329e-06, "objective/entropy": 83.415283203125, "objective/kl": 21.930896759033203, "objective/non_score_reward": -1.0965447425842285, "objective/rlhf_reward": -1.4624603136789527, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 79.15277862548828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.546875, "step": 136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998199701309204 }, { "episode": 2208, "epoch": 0.013229320199877773, "loss/policy_avg": 0.2460360825061798, "lr": 9.912448875255624e-06, "objective/entropy": 137.11976623535156, "objective/kl": 21.218502044677734, "objective/non_score_reward": -1.060925006866455, "objective/rlhf_reward": -2.8198681666451373, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 67.851806640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.666015625, "step": 137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969704151153564 }, { "episode": 2224, "epoch": 0.013325184839007322, "loss/policy_avg": 0.21244561672210693, "lr": 9.911809815950921e-06, "objective/entropy": 175.0180206298828, "objective/kl": 16.889467239379883, "objective/non_score_reward": -0.8444733619689941, "objective/rlhf_reward": -1.4304821593331654, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 78.4537353515625, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.515625, "step": 138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9985227584838867 }, { "episode": 2240, "epoch": 0.013421049478136871, "loss/policy_avg": 0.18417471647262573, "lr": 9.911170756646218e-06, "objective/entropy": 224.734619140625, "objective/kl": 33.112342834472656, "objective/non_score_reward": -1.6556169986724854, "objective/rlhf_reward": -4.889135018984477, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 160.8165283203125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7109375, "step": 139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992772340774536 }, { "episode": 2256, "epoch": 0.01351691411726642, "loss/policy_avg": 0.40639203786849976, "lr": 9.910531697341515e-06, "objective/entropy": 69.94343566894531, "objective/kl": 24.266616821289062, "objective/non_score_reward": -1.2133309841156006, "objective/rlhf_reward": -3.40272543868576, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 126.5036392211914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5625, "step": 140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999225378036499 }, { "episode": 2272, "epoch": 0.01361277875639597, "loss/policy_avg": 0.28501349687576294, "lr": 9.909892638036812e-06, "objective/entropy": 61.523101806640625, "objective/kl": 17.776689529418945, "objective/non_score_reward": -0.8888344764709473, "objective/rlhf_reward": -1.8220045725504557, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 87.0567398071289, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.537109375, "step": 141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000370740890503 }, { "episode": 2288, "epoch": 0.013708643395525518, "loss/policy_avg": 0.30668091773986816, "lr": 9.909253578732107e-06, "objective/entropy": 227.46041870117188, "objective/kl": 20.17832374572754, "objective/non_score_reward": -1.0089161396026611, "objective/rlhf_reward": -2.5198930142247047, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 50.498268127441406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.685546875, "step": 142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999216079711914 }, { "episode": 2304, "epoch": 0.013804508034655067, "loss/policy_avg": 0.3348355293273926, "lr": 9.908614519427404e-06, "objective/entropy": 164.50863647460938, "objective/kl": 13.646249771118164, "objective/non_score_reward": -0.6823124885559082, "objective/rlhf_reward": -1.1251298821607407, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 63.31299591064453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.95703125, "step": 143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986834526062012 }, { "episode": 2320, "epoch": 0.013900372673784616, "loss/policy_avg": 0.7517778277397156, "lr": 9.9079754601227e-06, "objective/entropy": -69.42684936523438, "objective/kl": 13.007519721984863, "objective/non_score_reward": -0.6503760814666748, "objective/rlhf_reward": -0.2015041172504426, "objective/scores": 0.6, "policy/approxkl_avg": 15.501136779785156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.533203125, "step": 144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9969980716705322 }, { "episode": 2336, "epoch": 0.013996237312914165, "loss/policy_avg": 0.1666509509086609, "lr": 9.907336400817996e-06, "objective/entropy": 175.3941192626953, "objective/kl": 20.383106231689453, "objective/non_score_reward": -1.0191553831100464, "objective/rlhf_reward": -2.414762055099593, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 102.40309143066406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65625, "step": 145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9960074424743652 }, { "episode": 2352, "epoch": 0.014092101952043714, "loss/policy_avg": 0.08111919462680817, "lr": 9.906697341513293e-06, "objective/entropy": 66.45804595947266, "objective/kl": 20.63641357421875, "objective/non_score_reward": -1.0318206548690796, "objective/rlhf_reward": -2.7680326637968253, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 16.144962310791016, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.44921875, "step": 146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003509521484375 }, { "episode": 2368, "epoch": 0.014187966591173263, "loss/policy_avg": 0.2162848860025406, "lr": 9.90605828220859e-06, "objective/entropy": 66.34003448486328, "objective/kl": 21.03724479675293, "objective/non_score_reward": -1.051862359046936, "objective/rlhf_reward": -1.8074494361877442, "objective/scores": 0.6, "policy/approxkl_avg": 56.59767150878906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.544921875, "step": 147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9967288970947266 }, { "episode": 2384, "epoch": 0.014283831230302812, "loss/policy_avg": 0.13452857732772827, "lr": 9.905419222903886e-06, "objective/entropy": 160.91929626464844, "objective/kl": 22.133365631103516, "objective/non_score_reward": -1.10666823387146, "objective/rlhf_reward": -2.693339631954829, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 64.49358367919922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988274574279785 }, { "episode": 2400, "epoch": 0.01437969586943236, "loss/policy_avg": 1.6826289892196655, "lr": 9.904780163599183e-06, "objective/entropy": -182.28018188476562, "objective/kl": 22.543842315673828, "objective/non_score_reward": -1.1271920204162598, "objective/rlhf_reward": -3.084936280449001, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 70.59880828857422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.62890625, "step": 149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008223056793213 }, { "episode": 2416, "epoch": 0.01447556050856191, "loss/policy_avg": 0.4059183597564697, "lr": 9.904141104294478e-06, "objective/entropy": 225.73135375976562, "objective/kl": 23.115840911865234, "objective/non_score_reward": -1.1557921171188354, "objective/rlhf_reward": -2.8898351351420084, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 45.14168930053711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.703125, "step": 150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9997532367706299 }, { "episode": 2432, "epoch": 0.01457142514769146, "loss/policy_avg": 0.10681919753551483, "lr": 9.903502044989775e-06, "objective/entropy": 213.69598388671875, "objective/kl": 26.178190231323242, "objective/non_score_reward": -1.3089096546173096, "objective/rlhf_reward": -3.894002726584106, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 92.52935791015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975218772888184 }, { "episode": 2448, "epoch": 0.01466728978682101, "loss/policy_avg": -0.2853464186191559, "lr": 9.902862985685072e-06, "objective/entropy": 58.680572509765625, "objective/kl": 17.81705665588379, "objective/non_score_reward": -0.8908528089523315, "objective/rlhf_reward": -0.6396921619188514, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 89.08941650390625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.669921875, "step": 152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0143747329711914 }, { "episode": 2464, "epoch": 0.014763154425950558, "loss/policy_avg": 0.07825072109699249, "lr": 9.902223926380369e-06, "objective/entropy": 198.86288452148438, "objective/kl": 28.436542510986328, "objective/non_score_reward": -1.4218271970748901, "objective/rlhf_reward": -2.7635896548044414, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 44.41461181640625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59375, "step": 153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9994783401489258 }, { "episode": 2480, "epoch": 0.014859019065080107, "loss/policy_avg": 0.27155977487564087, "lr": 9.901584867075666e-06, "objective/entropy": 89.04707336425781, "objective/kl": 21.113758087158203, "objective/non_score_reward": -1.0556879043579102, "objective/rlhf_reward": -1.2990326031458106, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 58.70441818237305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.623046875, "step": 154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9971623420715332 }, { "episode": 2496, "epoch": 0.014954883704209656, "loss/policy_avg": 0.3080964982509613, "lr": 9.900945807770961e-06, "objective/entropy": 35.38983154296875, "objective/kl": 21.02568817138672, "objective/non_score_reward": -1.0512844324111938, "objective/rlhf_reward": -2.7241851715401406, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 52.82551193237305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.560546875, "step": 155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9960044622421265 }, { "episode": 2512, "epoch": 0.015050748343339205, "loss/policy_avg": 4.562356472015381, "lr": 9.900306748466258e-06, "objective/entropy": 253.11752319335938, "objective/kl": 22.01451301574707, "objective/non_score_reward": -1.1007256507873535, "objective/rlhf_reward": -2.798782501284199, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 74.26364135742188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.765625, "step": 156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9965643882751465 }, { "episode": 2528, "epoch": 0.015146612982468754, "loss/policy_avg": 0.21197248995304108, "lr": 9.899667689161555e-06, "objective/entropy": 149.58770751953125, "objective/kl": 23.317626953125, "objective/non_score_reward": -1.1658812761306763, "objective/rlhf_reward": -2.2635251045227047, "objective/scores": 0.6, "policy/approxkl_avg": 51.574981689453125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4736328125, "step": 157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.995574951171875 }, { "episode": 2544, "epoch": 0.015242477621598303, "loss/policy_avg": 0.20880039036273956, "lr": 9.899028629856852e-06, "objective/entropy": -64.38532257080078, "objective/kl": 25.92443084716797, "objective/non_score_reward": -1.2962216138839722, "objective/rlhf_reward": -3.784886217117309, "objective/scores": 0.35, "policy/approxkl_avg": 138.45706176757812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.568359375, "step": 158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9968822002410889 }, { "episode": 2560, "epoch": 0.015338342260727852, "loss/policy_avg": 0.21600359678268433, "lr": 9.898389570552149e-06, "objective/entropy": 3.545970916748047, "objective/kl": 23.09051513671875, "objective/non_score_reward": -1.1545257568359375, "objective/rlhf_reward": -2.6706922007369354, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 36.885650634765625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55859375, "step": 159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993218183517456 }, { "episode": 2576, "epoch": 0.015434206899857401, "loss/policy_avg": 0.5031390190124512, "lr": 9.897750511247446e-06, "objective/entropy": 98.00604248046875, "objective/kl": 25.33047866821289, "objective/non_score_reward": -1.2665239572525024, "objective/rlhf_reward": -3.4619760847726635, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 83.63774871826172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.671875, "step": 160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000274896621704 }, { "episode": 2592, "epoch": 0.01553007153898695, "loss/policy_avg": 0.018053412437438965, "lr": 9.89711145194274e-06, "objective/entropy": 2.8434524536132812, "objective/kl": 24.395084381103516, "objective/non_score_reward": -1.2197542190551758, "objective/rlhf_reward": -3.2171576074963673, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.6353378295898438, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.64453125, "step": 161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001478433609009 }, { "episode": 2608, "epoch": 0.0156259361781165, "loss/policy_avg": 0.25576311349868774, "lr": 9.896472392638038e-06, "objective/entropy": -64.24278259277344, "objective/kl": 16.287256240844727, "objective/non_score_reward": -0.8143627643585205, "objective/rlhf_reward": -1.5241178731123606, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 25.824050903320312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6953125, "step": 162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984066486358643 }, { "episode": 2624, "epoch": 0.01572180081724605, "loss/policy_avg": 0.2750253677368164, "lr": 9.895833333333334e-06, "objective/entropy": 170.5203857421875, "objective/kl": 35.09113693237305, "objective/non_score_reward": -1.7545567750930786, "objective/rlhf_reward": -4.094508086086485, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 91.88323974609375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.76171875, "step": 163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9978190660476685 }, { "episode": 2640, "epoch": 0.0158176654563756, "loss/policy_avg": 0.2685161828994751, "lr": 9.895194274028631e-06, "objective/entropy": 107.911376953125, "objective/kl": 21.708637237548828, "objective/non_score_reward": -1.0854318141937256, "objective/rlhf_reward": -2.8911290570214834, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 48.546165466308594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.603515625, "step": 164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9939230680465698 }, { "episode": 2656, "epoch": 0.015913530095505148, "loss/policy_avg": 0.3802343010902405, "lr": 9.894555214723928e-06, "objective/entropy": 137.427978515625, "objective/kl": 20.673809051513672, "objective/non_score_reward": -1.0336904525756836, "objective/rlhf_reward": -2.793125978022247, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 36.90850830078125, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.63671875, "step": 165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9987661838531494 }, { "episode": 2672, "epoch": 0.016009394734634697, "loss/policy_avg": 0.0008638650178909302, "lr": 9.893916155419225e-06, "objective/entropy": 159.45681762695312, "objective/kl": 20.339492797851562, "objective/non_score_reward": -1.016974687576294, "objective/rlhf_reward": -2.7086488542303275, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 6.459288597106934, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.515625, "step": 166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9977226257324219 }, { "episode": 2688, "epoch": 0.016105259373764245, "loss/policy_avg": 0.3463206887245178, "lr": 9.89327709611452e-06, "objective/entropy": -75.2735824584961, "objective/kl": 27.865215301513672, "objective/non_score_reward": -1.3932607173919678, "objective/rlhf_reward": -4.173042631149292, "objective/scores": 0.35, "policy/approxkl_avg": 139.90060424804688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016684532165527 }, { "episode": 2704, "epoch": 0.016201124012893794, "loss/policy_avg": 0.07642253488302231, "lr": 9.892638036809815e-06, "objective/entropy": 38.99913787841797, "objective/kl": 19.061498641967773, "objective/non_score_reward": -0.9530749320983887, "objective/rlhf_reward": -1.987470920356821, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.035629272460938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.484375, "step": 168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0013604164123535 }, { "episode": 2720, "epoch": 0.016296988652023343, "loss/policy_avg": 0.2990867495536804, "lr": 9.891998977505112e-06, "objective/entropy": 199.7046661376953, "objective/kl": 23.46067237854004, "objective/non_score_reward": -1.1730337142944336, "objective/rlhf_reward": -3.268302519519893, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 19.572267532348633, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6171875, "step": 169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998270034790039 }, { "episode": 2736, "epoch": 0.016392853291152892, "loss/policy_avg": 0.3040146231651306, "lr": 9.89135991820041e-06, "objective/entropy": 84.5781021118164, "objective/kl": 24.218996047973633, "objective/non_score_reward": -1.2109497785568237, "objective/rlhf_reward": -2.896387885289128, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 91.4429931640625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007870197296143 }, { "episode": 2752, "epoch": 0.01648871793028244, "loss/policy_avg": 0.24132516980171204, "lr": 9.890720858895706e-06, "objective/entropy": 25.26891326904297, "objective/kl": 12.311616897583008, "objective/non_score_reward": -0.6155807971954346, "objective/rlhf_reward": -2.4623232781887054, "objective/scores": 0.0, "policy/approxkl_avg": 4.089572906494141, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984550476074219 }, { "episode": 2768, "epoch": 0.01658458256941199, "loss/policy_avg": 0.07815683633089066, "lr": 9.890081799591003e-06, "objective/entropy": -2.7739601135253906, "objective/kl": 20.480499267578125, "objective/non_score_reward": -1.0240248441696167, "objective/rlhf_reward": -2.6151468185738325, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 11.766371726989746, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.52734375, "step": 172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999391794204712 }, { "episode": 2784, "epoch": 0.01668044720854154, "loss/policy_avg": 0.31003671884536743, "lr": 9.8894427402863e-06, "objective/entropy": -5.804538726806641, "objective/kl": 23.551572799682617, "objective/non_score_reward": -1.1775786876678467, "objective/rlhf_reward": -3.2597167297319025, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 241.19540405273438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.587890625, "step": 173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990514516830444 }, { "episode": 2800, "epoch": 0.016776311847671088, "loss/policy_avg": 0.027285143733024597, "lr": 9.888803680981595e-06, "objective/entropy": 91.14071655273438, "objective/kl": 19.611085891723633, "objective/non_score_reward": -0.9805543422698975, "objective/rlhf_reward": -2.44126462471044, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 60.10600662231445, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.537109375, "step": 174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972784519195557 }, { "episode": 2816, "epoch": 0.016872176486800637, "loss/policy_avg": 0.2845172882080078, "lr": 9.888164621676892e-06, "objective/entropy": 30.190153121948242, "objective/kl": 24.783939361572266, "objective/non_score_reward": -1.239197015762329, "objective/rlhf_reward": -3.578185775367123, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 76.30748748779297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.443359375, "step": 175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994440078735352 }, { "episode": 2832, "epoch": 0.016968041125930186, "loss/policy_avg": 0.5662503838539124, "lr": 9.887525562372189e-06, "objective/entropy": 60.807342529296875, "objective/kl": 12.370782852172852, "objective/non_score_reward": -0.6185390949249268, "objective/rlhf_reward": -1.0503242506581225, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 14.155126571655273, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.525390625, "step": 176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9987331628799438 }, { "episode": 2848, "epoch": 0.017063905765059735, "loss/policy_avg": 0.08586982637643814, "lr": 9.886886503067486e-06, "objective/entropy": 43.38105010986328, "objective/kl": 24.246856689453125, "objective/non_score_reward": -1.2123429775238037, "objective/rlhf_reward": -3.470769503203732, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 141.50592041015625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.64453125, "step": 177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969239234924316 }, { "episode": 2864, "epoch": 0.017159770404189284, "loss/policy_avg": 0.26094895601272583, "lr": 9.886247443762783e-06, "objective/entropy": 54.85191345214844, "objective/kl": 20.912307739257812, "objective/non_score_reward": -1.0456154346466064, "objective/rlhf_reward": -2.7824616193771363, "objective/scores": 0.35, "policy/approxkl_avg": 19.43996810913086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4755859375, "step": 178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007808208465576 }, { "episode": 2880, "epoch": 0.017255635043318833, "loss/policy_avg": -0.0008885636925697327, "lr": 9.88560838445808e-06, "objective/entropy": 1.5364952087402344, "objective/kl": 18.547964096069336, "objective/non_score_reward": -0.9273982048034668, "objective/rlhf_reward": -1.762181530671056, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 103.84625244140625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.52734375, "step": 179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0031652450561523 }, { "episode": 2896, "epoch": 0.017351499682448382, "loss/policy_avg": 0.07095308601856232, "lr": 9.884969325153375e-06, "objective/entropy": -57.707908630371094, "objective/kl": 17.486156463623047, "objective/non_score_reward": -0.8743079304695129, "objective/rlhf_reward": -1.3745254895844794, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 35.78956604003906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.63671875, "step": 180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995075464248657 }, { "episode": 2912, "epoch": 0.01744736432157793, "loss/policy_avg": 0.42247164249420166, "lr": 9.884330265848671e-06, "objective/entropy": 194.7113037109375, "objective/kl": 21.53358268737793, "objective/non_score_reward": -1.0766791105270386, "objective/rlhf_reward": -2.750457256045893, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 58.89783477783203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.720703125, "step": 181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996431827545166 }, { "episode": 2928, "epoch": 0.01754322896070748, "loss/policy_avg": 0.3189627528190613, "lr": 9.883691206543968e-06, "objective/entropy": 125.43355560302734, "objective/kl": 20.729223251342773, "objective/non_score_reward": -1.0364612340927124, "objective/rlhf_reward": -2.767242708293301, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 31.974578857421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984747171401978 }, { "episode": 2944, "epoch": 0.01763909359983703, "loss/policy_avg": 0.19416040182113647, "lr": 9.883052147239265e-06, "objective/entropy": 127.4957275390625, "objective/kl": 23.107641220092773, "objective/non_score_reward": -1.1553820371627808, "objective/rlhf_reward": -3.2429258609689295, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 41.45734786987305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.376953125, "step": 183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999535322189331 }, { "episode": 2960, "epoch": 0.017734958238966578, "loss/policy_avg": 0.04916887357831001, "lr": 9.882413087934562e-06, "objective/entropy": -16.33904266357422, "objective/kl": 15.624849319458008, "objective/non_score_reward": -0.7812424898147583, "objective/rlhf_reward": -1.002263667360816, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 86.75860595703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8203125, "step": 184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9967164993286133 }, { "episode": 2976, "epoch": 0.017830822878096127, "loss/policy_avg": 0.15854808688163757, "lr": 9.881774028629857e-06, "objective/entropy": -9.968147277832031, "objective/kl": 20.46514320373535, "objective/non_score_reward": -1.0232571363449097, "objective/rlhf_reward": -2.35969527165095, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 16.395225524902344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5859375, "step": 185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976359605789185 }, { "episode": 2992, "epoch": 0.017926687517225676, "loss/policy_avg": 0.36498603224754333, "lr": 9.881134969325154e-06, "objective/entropy": 209.59991455078125, "objective/kl": 18.690290451049805, "objective/non_score_reward": -0.9345145225524902, "objective/rlhf_reward": -2.338058030605316, "objective/scores": 0.35, "policy/approxkl_avg": 12.64120101928711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.623046875, "step": 186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9994118213653564 }, { "episode": 3008, "epoch": 0.018022552156355228, "loss/policy_avg": 0.15073028206825256, "lr": 9.880495910020451e-06, "objective/entropy": 33.50044250488281, "objective/kl": 21.099205017089844, "objective/non_score_reward": -1.0549602508544922, "objective/rlhf_reward": 0.1801587581634525, "objective/scores": 1.1, "policy/approxkl_avg": 28.017484664916992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.666015625, "step": 187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000263214111328 }, { "episode": 3024, "epoch": 0.018118416795484777, "loss/policy_avg": 0.04914219304919243, "lr": 9.879856850715748e-06, "objective/entropy": 109.99685668945312, "objective/kl": 23.795440673828125, "objective/non_score_reward": -1.1897720098495483, "objective/rlhf_reward": -0.3590880990028378, "objective/scores": 1.1, "policy/approxkl_avg": 17.797225952148438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.529296875, "step": 188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0002386569976807 }, { "episode": 3040, "epoch": 0.018214281434614326, "loss/policy_avg": 0.26782599091529846, "lr": 9.879217791411043e-06, "objective/entropy": 46.40031051635742, "objective/kl": 15.295504570007324, "objective/non_score_reward": -0.764775276184082, "objective/rlhf_reward": -1.6998512086614799, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.033124923706055, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4287109375, "step": 189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0006394386291504 }, { "episode": 3056, "epoch": 0.018310146073743875, "loss/policy_avg": -0.0003484562039375305, "lr": 9.87857873210634e-06, "objective/entropy": -128.13638305664062, "objective/kl": 23.236797332763672, "objective/non_score_reward": -1.1618399620056152, "objective/rlhf_reward": -2.985500340879546, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 122.61852264404297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.521484375, "step": 190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998673439025879 }, { "episode": 3072, "epoch": 0.018406010712873424, "loss/policy_avg": 0.285878986120224, "lr": 9.877939672801637e-06, "objective/entropy": -155.79151916503906, "objective/kl": 17.15728187561035, "objective/non_score_reward": -0.8578640818595886, "objective/rlhf_reward": -1.6981231282154718, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 27.024686813354492, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.578125, "step": 191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977209568023682 }, { "episode": 3088, "epoch": 0.018501875352002973, "loss/policy_avg": 0.03845605254173279, "lr": 9.877300613496934e-06, "objective/entropy": -79.23377227783203, "objective/kl": 24.854154586791992, "objective/non_score_reward": -1.2427077293395996, "objective/rlhf_reward": -3.4145718505054266, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 108.08650970458984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.54296875, "step": 192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9965288639068604 }, { "episode": 3104, "epoch": 0.018597739991132522, "loss/policy_avg": 0.22054271399974823, "lr": 9.876661554192229e-06, "objective/entropy": 58.46562576293945, "objective/kl": 18.69571876525879, "objective/non_score_reward": -0.9347859621047974, "objective/rlhf_reward": -1.3391437292099, "objective/scores": 0.6, "policy/approxkl_avg": 17.535587310791016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996351957321167 }, { "episode": 3120, "epoch": 0.01869360463026207, "loss/policy_avg": 0.46004775166511536, "lr": 9.876022494887526e-06, "objective/entropy": 208.6689453125, "objective/kl": 24.537294387817383, "objective/non_score_reward": -1.2268648147583008, "objective/rlhf_reward": -3.3511998941570074, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 103.11289978027344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6171875, "step": 194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980366230010986 }, { "episode": 3136, "epoch": 0.01878946926939162, "loss/policy_avg": 0.14284425973892212, "lr": 9.875383435582823e-06, "objective/entropy": -140.25045776367188, "objective/kl": 21.156387329101562, "objective/non_score_reward": -1.0578192472457886, "objective/rlhf_reward": -1.8312772423028945, "objective/scores": 0.6, "policy/approxkl_avg": 95.11038208007812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69921875, "step": 195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0021564960479736 }, { "episode": 3152, "epoch": 0.01888533390852117, "loss/policy_avg": 0.4036502540111542, "lr": 9.87474437627812e-06, "objective/entropy": 97.97139739990234, "objective/kl": 20.765098571777344, "objective/non_score_reward": -1.038254737854004, "objective/rlhf_reward": -1.7530193686485291, "objective/scores": 0.6, "policy/approxkl_avg": 33.61680603027344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.75, "step": 196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9960455894470215 }, { "episode": 3168, "epoch": 0.018981198547650718, "loss/policy_avg": 0.03367016091942787, "lr": 9.874105316973416e-06, "objective/entropy": 110.7692642211914, "objective/kl": 32.466636657714844, "objective/non_score_reward": -1.6233320236206055, "objective/rlhf_reward": -4.668499465259623, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.905399322509766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66015625, "step": 197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000126361846924 }, { "episode": 3184, "epoch": 0.019077063186780267, "loss/policy_avg": 0.3382406532764435, "lr": 9.873466257668712e-06, "objective/entropy": -46.87655258178711, "objective/kl": 23.83783531188965, "objective/non_score_reward": -1.1918917894363403, "objective/rlhf_reward": -3.44205424550168, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 26.46108055114746, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4814453125, "step": 198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9974579811096191 }, { "episode": 3200, "epoch": 0.019172927825909816, "loss/policy_avg": 0.05052588880062103, "lr": 9.872827198364009e-06, "objective/entropy": -62.79549789428711, "objective/kl": 19.587276458740234, "objective/non_score_reward": -0.9793638586997986, "objective/rlhf_reward": -0.9937364205133643, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.62165069580078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.564453125, "step": 199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9972220659255981 }, { "episode": 3216, "epoch": 0.019268792465039365, "loss/policy_avg": 0.2230260968208313, "lr": 9.872188139059305e-06, "objective/entropy": -37.75834655761719, "objective/kl": 23.102069854736328, "objective/non_score_reward": -1.1551035642623901, "objective/rlhf_reward": -3.2787786035830075, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 56.49012756347656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.583984375, "step": 200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000248908996582 }, { "episode": 3232, "epoch": 0.019364657104168913, "loss/policy_avg": 0.4118785858154297, "lr": 9.871549079754602e-06, "objective/entropy": 85.49769592285156, "objective/kl": 25.69809913635254, "objective/non_score_reward": -1.284904956817627, "objective/rlhf_reward": -3.5833605816036016, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 56.752174377441406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66015625, "step": 201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987632036209106 }, { "episode": 3248, "epoch": 0.019460521743298462, "loss/policy_avg": 0.06031988561153412, "lr": 9.8709100204499e-06, "objective/entropy": 16.456554412841797, "objective/kl": 25.35955047607422, "objective/non_score_reward": -1.2679774761199951, "objective/rlhf_reward": -3.6213118239358515, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 21.745624542236328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.529296875, "step": 202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980382919311523 }, { "episode": 3264, "epoch": 0.01955638638242801, "loss/policy_avg": 0.06312263011932373, "lr": 9.870270961145196e-06, "objective/entropy": 132.99948120117188, "objective/kl": 22.432659149169922, "objective/non_score_reward": -1.1216330528259277, "objective/rlhf_reward": -2.8246725253468616, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 93.43849182128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.568359375, "step": 203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995737075805664 }, { "episode": 3280, "epoch": 0.01965225102155756, "loss/policy_avg": 0.6064414978027344, "lr": 9.869631901840491e-06, "objective/entropy": -19.207683563232422, "objective/kl": 18.83993148803711, "objective/non_score_reward": -0.9419965744018555, "objective/rlhf_reward": -2.3173880978540033, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 90.60572052001953, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4931640625, "step": 204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000382661819458 }, { "episode": 3296, "epoch": 0.01974811566068711, "loss/policy_avg": 0.2940763831138611, "lr": 9.868992842535788e-06, "objective/entropy": 83.77371978759766, "objective/kl": 25.884700775146484, "objective/non_score_reward": -1.2942349910736084, "objective/rlhf_reward": -3.3521112903681507, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 39.873409271240234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.447265625, "step": 205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972522258758545 }, { "episode": 3312, "epoch": 0.019843980299816658, "loss/policy_avg": 0.18257562816143036, "lr": 9.868353783231085e-06, "objective/entropy": 119.6646728515625, "objective/kl": 27.568458557128906, "objective/non_score_reward": -1.3784228563308716, "objective/rlhf_reward": -1.1136915445327755, "objective/scores": 1.1, "policy/approxkl_avg": 48.24208068847656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66796875, "step": 206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987564086914062 }, { "episode": 3328, "epoch": 0.019939844938946207, "loss/policy_avg": -0.011964879930019379, "lr": 9.867714723926382e-06, "objective/entropy": 79.78416442871094, "objective/kl": 24.409799575805664, "objective/non_score_reward": -1.2204899787902832, "objective/rlhf_reward": -3.5033578658975184, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 19.269145965576172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4658203125, "step": 207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000608205795288 }, { "episode": 3344, "epoch": 0.020035709578075756, "loss/policy_avg": 0.04908262565732002, "lr": 9.867075664621679e-06, "objective/entropy": 174.413818359375, "objective/kl": 24.83539581298828, "objective/non_score_reward": -1.241769790649414, "objective/rlhf_reward": -3.3629594779649548, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 14.995980262756348, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.54296875, "step": 208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9985637664794922 }, { "episode": 3360, "epoch": 0.020131574217205305, "loss/policy_avg": 0.14710021018981934, "lr": 9.866436605316974e-06, "objective/entropy": 132.51194763183594, "objective/kl": 29.743432998657227, "objective/non_score_reward": -1.4871716499328613, "objective/rlhf_reward": -4.344566795889454, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 65.08041381835938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.490234375, "step": 209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0021185874938965 }, { "episode": 3376, "epoch": 0.020227438856334854, "loss/policy_avg": 0.0796532854437828, "lr": 9.86579754601227e-06, "objective/entropy": 1.3461151123046875, "objective/kl": 26.279298782348633, "objective/non_score_reward": -1.313965082168579, "objective/rlhf_reward": -0.8558599710464474, "objective/scores": 1.1, "policy/approxkl_avg": 105.49284362792969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989545345306396 }, { "episode": 3392, "epoch": 0.020323303495464403, "loss/policy_avg": -0.03664415329694748, "lr": 9.865158486707568e-06, "objective/entropy": -37.266082763671875, "objective/kl": 19.48423957824707, "objective/non_score_reward": -0.9742119908332825, "objective/rlhf_reward": -0.9731288298380103, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.304027557373047, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.638671875, "step": 211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003216028213501 }, { "episode": 3408, "epoch": 0.020419168134593952, "loss/policy_avg": 0.30985838174819946, "lr": 9.864519427402863e-06, "objective/entropy": 94.80859375, "objective/kl": 29.94342041015625, "objective/non_score_reward": -1.4971709251403809, "objective/rlhf_reward": -4.564851482112971, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 115.7642593383789, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.74609375, "step": 212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9959279298782349 }, { "episode": 3424, "epoch": 0.0205150327737235, "loss/policy_avg": 0.23234406113624573, "lr": 9.86388036809816e-06, "objective/entropy": 125.32878875732422, "objective/kl": 33.22450637817383, "objective/non_score_reward": -1.6612253189086914, "objective/rlhf_reward": -4.820072407993387, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 82.43852233886719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001713752746582 }, { "episode": 3440, "epoch": 0.02061089741285305, "loss/policy_avg": 1.5097947120666504, "lr": 9.863241308793457e-06, "objective/entropy": 132.66845703125, "objective/kl": 27.622318267822266, "objective/non_score_reward": -1.3811159133911133, "objective/rlhf_reward": -3.6996345475044956, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 26.179336547851562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.708984375, "step": 214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993813037872314 }, { "episode": 3456, "epoch": 0.0207067620519826, "loss/policy_avg": 0.12209601700305939, "lr": 9.862602249488753e-06, "objective/entropy": 132.88406372070312, "objective/kl": 26.24971580505371, "objective/non_score_reward": -1.312485694885254, "objective/rlhf_reward": -5.249942898750305, "objective/scores": 0.0, "policy/approxkl_avg": 41.524139404296875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7109375, "step": 215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990661144256592 }, { "episode": 3472, "epoch": 0.02080262669111215, "loss/policy_avg": 0.3654727339744568, "lr": 9.86196319018405e-06, "objective/entropy": 39.344974517822266, "objective/kl": 23.619754791259766, "objective/non_score_reward": -1.18098783493042, "objective/rlhf_reward": -1.8002320870172706, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.19040584564209, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4951171875, "step": 216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990694522857666 }, { "episode": 3488, "epoch": 0.0208984913302417, "loss/policy_avg": 0.05907230079174042, "lr": 9.861324130879346e-06, "objective/entropy": -49.055564880371094, "objective/kl": 27.70423126220703, "objective/non_score_reward": -1.3852115869522095, "objective/rlhf_reward": -3.8789869598752125, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 62.16511917114258, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973804950714111 }, { "episode": 3504, "epoch": 0.02099435596937125, "loss/policy_avg": 0.5758800506591797, "lr": 9.860685071574642e-06, "objective/entropy": 18.1787166595459, "objective/kl": 25.688358306884766, "objective/non_score_reward": -1.2844178676605225, "objective/rlhf_reward": -2.2139523147952285, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 23.39984130859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.498046875, "step": 218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974064826965332 }, { "episode": 3520, "epoch": 0.021090220608500798, "loss/policy_avg": 0.2610527575016022, "lr": 9.86004601226994e-06, "objective/entropy": -68.09791564941406, "objective/kl": 26.7615966796875, "objective/non_score_reward": -1.3380796909332275, "objective/rlhf_reward": -4.026806149512453, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 124.13450622558594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4599609375, "step": 219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986741542816162 }, { "episode": 3536, "epoch": 0.021186085247630347, "loss/policy_avg": 0.1624567210674286, "lr": 9.859406952965236e-06, "objective/entropy": -113.99856567382812, "objective/kl": 19.689868927001953, "objective/non_score_reward": -0.9844935536384583, "objective/rlhf_reward": -2.113145466121744, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 45.295875549316406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004196166992188 }, { "episode": 3552, "epoch": 0.021281949886759896, "loss/policy_avg": 0.13548433780670166, "lr": 9.858767893660533e-06, "objective/entropy": 154.66708374023438, "objective/kl": 31.08365249633789, "objective/non_score_reward": -1.554182529449463, "objective/rlhf_reward": -4.554870968282805, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 43.560997009277344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7265625, "step": 221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972419738769531 }, { "episode": 3568, "epoch": 0.021377814525889445, "loss/policy_avg": 0.04025420919060707, "lr": 9.858128834355828e-06, "objective/entropy": 145.02468872070312, "objective/kl": 31.459678649902344, "objective/non_score_reward": -1.572983980178833, "objective/rlhf_reward": -4.932686292861385, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 41.05935287475586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4560546875, "step": 222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0009706020355225 }, { "episode": 3584, "epoch": 0.021473679165018994, "loss/policy_avg": 1.5885295867919922, "lr": 9.857489775051125e-06, "objective/entropy": 141.5781707763672, "objective/kl": 34.53314971923828, "objective/non_score_reward": -1.726657509803772, "objective/rlhf_reward": -5.244770532072174, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 37.03607177734375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.658203125, "step": 223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.992570400238037 }, { "episode": 3600, "epoch": 0.021569543804148543, "loss/policy_avg": 0.9811650514602661, "lr": 9.856850715746422e-06, "objective/entropy": -30.946441650390625, "objective/kl": 29.145998001098633, "objective/non_score_reward": -1.4572999477386475, "objective/rlhf_reward": -4.450597622481686, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 19.481060028076172, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.462890625, "step": 224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983983039855957 }, { "episode": 3616, "epoch": 0.021665408443278092, "loss/policy_avg": 0.5196128487586975, "lr": 9.856211656441719e-06, "objective/entropy": -16.55962371826172, "objective/kl": 28.4706974029541, "objective/non_score_reward": -1.423534870147705, "objective/rlhf_reward": -3.5714332482972484, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 117.12289428710938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.732421875, "step": 225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975980520248413 }, { "episode": 3632, "epoch": 0.02176127308240764, "loss/policy_avg": 0.6528609395027161, "lr": 9.855572597137016e-06, "objective/entropy": 136.64077758789062, "objective/kl": 32.46646499633789, "objective/non_score_reward": -1.6233232021331787, "objective/rlhf_reward": -2.093292927742004, "objective/scores": 1.1, "policy/approxkl_avg": 44.35145950317383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.669921875, "step": 226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994146823883057 }, { "episode": 3648, "epoch": 0.02185713772153719, "loss/policy_avg": 0.9434906244277954, "lr": 9.854933537832313e-06, "objective/entropy": -36.75615310668945, "objective/kl": 31.890575408935547, "objective/non_score_reward": -1.5945286750793457, "objective/rlhf_reward": -5.052601966887636, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 65.19577026367188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.59375, "step": 227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979374408721924 }, { "episode": 3664, "epoch": 0.02195300236066674, "loss/policy_avg": 0.36130765080451965, "lr": 9.854294478527608e-06, "objective/entropy": 47.61101531982422, "objective/kl": 18.669593811035156, "objective/non_score_reward": -0.9334796071052551, "objective/rlhf_reward": -2.3339184284210206, "objective/scores": 0.35, "policy/approxkl_avg": 15.266149520874023, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.578125, "step": 228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9959979057312012 }, { "episode": 3680, "epoch": 0.022048866999796288, "loss/policy_avg": 0.18321090936660767, "lr": 9.853655419222905e-06, "objective/entropy": 116.60293579101562, "objective/kl": 27.56112289428711, "objective/non_score_reward": -1.378056287765503, "objective/rlhf_reward": -3.5648136837052657, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 29.471284866333008, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.537109375, "step": 229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991540908813477 }, { "episode": 3696, "epoch": 0.022144731638925837, "loss/policy_avg": -0.044996485114097595, "lr": 9.853016359918202e-06, "objective/entropy": 38.275238037109375, "objective/kl": 28.720836639404297, "objective/non_score_reward": -1.4360418319702148, "objective/rlhf_reward": -4.187907754388407, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 173.6102752685547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.552734375, "step": 230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997882604598999 }, { "episode": 3712, "epoch": 0.022240596278055386, "loss/policy_avg": 0.027855467051267624, "lr": 9.852377300613498e-06, "objective/entropy": 123.59611511230469, "objective/kl": 30.175601959228516, "objective/non_score_reward": -1.5087801218032837, "objective/rlhf_reward": -4.478861062732294, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 50.733642578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.37109375, "step": 231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0003466606140137 }, { "episode": 3728, "epoch": 0.022336460917184935, "loss/policy_avg": -0.3093503713607788, "lr": 9.851738241308795e-06, "objective/entropy": 0.438995361328125, "objective/kl": 27.025171279907227, "objective/non_score_reward": -1.3512585163116455, "objective/rlhf_reward": -5.405034303665161, "objective/scores": 0.0, "policy/approxkl_avg": 13.092641830444336, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.615234375, "step": 232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000675678253174 }, { "episode": 3744, "epoch": 0.022432325556314484, "loss/policy_avg": -0.05236402899026871, "lr": 9.85109918200409e-06, "objective/entropy": 112.74819946289062, "objective/kl": 24.94538688659668, "objective/non_score_reward": -1.2472693920135498, "objective/rlhf_reward": -3.473305845054325, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 19.200075149536133, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.3583984375, "step": 233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.002117395401001 }, { "episode": 3760, "epoch": 0.022528190195444033, "loss/policy_avg": 0.21103611588478088, "lr": 9.850460122699387e-06, "objective/entropy": 73.77043151855469, "objective/kl": 28.00216293334961, "objective/non_score_reward": -1.4001080989837646, "objective/rlhf_reward": -3.6530211669968917, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 13.291183471679688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5009765625, "step": 234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995331764221191 }, { "episode": 3776, "epoch": 0.02262405483457358, "loss/policy_avg": 0.6418443918228149, "lr": 9.849821063394683e-06, "objective/entropy": 19.92426300048828, "objective/kl": 31.282997131347656, "objective/non_score_reward": -1.5641499757766724, "objective/rlhf_reward": -4.931086901456041, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 98.59768676757812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.34375, "step": 235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0005524158477783 }, { "episode": 3792, "epoch": 0.02271991947370313, "loss/policy_avg": 0.20836295187473297, "lr": 9.84918200408998e-06, "objective/entropy": 28.238201141357422, "objective/kl": 29.105060577392578, "objective/non_score_reward": -1.455253005027771, "objective/rlhf_reward": -4.264752714839533, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 34.374176025390625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.51171875, "step": 236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989919662475586 }, { "episode": 3808, "epoch": 0.02281578411283268, "loss/policy_avg": 0.43571943044662476, "lr": 9.848542944785276e-06, "objective/entropy": 144.94302368164062, "objective/kl": 33.369178771972656, "objective/non_score_reward": -1.6684589385986328, "objective/rlhf_reward": -5.314585768912716, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 113.68771362304688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.607421875, "step": 237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996192216873169 }, { "episode": 3824, "epoch": 0.02291164875196223, "loss/policy_avg": 0.14893671870231628, "lr": 9.847903885480573e-06, "objective/entropy": 186.38681030273438, "objective/kl": 41.077842712402344, "objective/non_score_reward": -2.0538923740386963, "objective/rlhf_reward": -6.611449215475636, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 168.3666229248047, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.755859375, "step": 238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984937906265259 }, { "episode": 3840, "epoch": 0.023007513391091777, "loss/policy_avg": 0.07648584991693497, "lr": 9.84726482617587e-06, "objective/entropy": -37.23631286621094, "objective/kl": 25.318248748779297, "objective/non_score_reward": -1.2659125328063965, "objective/rlhf_reward": -3.5073907067447454, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 50.266414642333984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48828125, "step": 239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979995489120483 }, { "episode": 3856, "epoch": 0.023103378030221326, "loss/policy_avg": -0.15926438570022583, "lr": 9.846625766871167e-06, "objective/entropy": 37.868736267089844, "objective/kl": 27.493305206298828, "objective/non_score_reward": -1.3746652603149414, "objective/rlhf_reward": -4.173148546248598, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.63505220413208, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5390625, "step": 240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0081896781921387 }, { "episode": 3872, "epoch": 0.023199242669350875, "loss/policy_avg": 0.14562831819057465, "lr": 9.845986707566462e-06, "objective/entropy": 15.188220977783203, "objective/kl": 28.046958923339844, "objective/non_score_reward": -1.4023480415344238, "objective/rlhf_reward": -4.1587937875703425, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 43.238990783691406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.578125, "step": 241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996706247329712 }, { "episode": 3888, "epoch": 0.023295107308480424, "loss/policy_avg": 0.11054911464452744, "lr": 9.845347648261759e-06, "objective/entropy": 65.03858947753906, "objective/kl": 30.087387084960938, "objective/non_score_reward": -1.5043694972991943, "objective/rlhf_reward": -4.070066402630742, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.83949613571167, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988956451416016 }, { "episode": 3904, "epoch": 0.023390971947609973, "loss/policy_avg": 0.3941475749015808, "lr": 9.844708588957056e-06, "objective/entropy": 59.93316650390625, "objective/kl": 25.623512268066406, "objective/non_score_reward": -1.2811756134033203, "objective/rlhf_reward": -3.52058264977129, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 78.30380249023438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5859375, "step": 243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990283250808716 }, { "episode": 3920, "epoch": 0.023486836586739522, "loss/policy_avg": 0.19095474481582642, "lr": 9.844069529652353e-06, "objective/entropy": 31.422988891601562, "objective/kl": 24.865825653076172, "objective/non_score_reward": -1.2432913780212402, "objective/rlhf_reward": -3.2398319403330484, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 38.12981033325195, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.53125, "step": 244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.004727840423584 }, { "episode": 3936, "epoch": 0.023582701225869074, "loss/policy_avg": 0.049357250332832336, "lr": 9.84343047034765e-06, "objective/entropy": 21.297576904296875, "objective/kl": 35.60150146484375, "objective/non_score_reward": -1.7800750732421875, "objective/rlhf_reward": -5.720300531387329, "objective/scores": 0.35, "policy/approxkl_avg": 38.869449615478516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4716796875, "step": 245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0019991397857666 }, { "episode": 3952, "epoch": 0.023678565864998623, "loss/policy_avg": 0.7713517546653748, "lr": 9.842791411042945e-06, "objective/entropy": 53.62720489501953, "objective/kl": 31.218942642211914, "objective/non_score_reward": -1.5609471797943115, "objective/rlhf_reward": -4.296377490239079, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 48.73869323730469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7265625, "step": 246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975836277008057 }, { "episode": 3968, "epoch": 0.023774430504128172, "loss/policy_avg": 0.008143262937664986, "lr": 9.842152351738242e-06, "objective/entropy": 171.02789306640625, "objective/kl": 34.79176330566406, "objective/non_score_reward": -1.7395880222320557, "objective/rlhf_reward": -5.296492939413176, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 21.7828369140625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.57421875, "step": 247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9989376068115234 }, { "episode": 3984, "epoch": 0.02387029514325772, "loss/policy_avg": -0.12264247238636017, "lr": 9.841513292433539e-06, "objective/entropy": 80.24577331542969, "objective/kl": 33.11949920654297, "objective/non_score_reward": -1.6559748649597168, "objective/rlhf_reward": -4.799071069034646, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 61.87395477294922, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4599609375, "step": 248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.003005027770996 }, { "episode": 4000, "epoch": 0.02396615978238727, "loss/policy_avg": 0.2658330202102661, "lr": 9.840874233128836e-06, "objective/entropy": 149.58941650390625, "objective/kl": 29.3863525390625, "objective/non_score_reward": -1.4693175554275513, "objective/rlhf_reward": -4.273150358263569, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 58.66055679321289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.51171875, "step": 249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972658157348633 }, { "episode": 4016, "epoch": 0.02406202442151682, "loss/policy_avg": 0.09115779399871826, "lr": 9.840235173824132e-06, "objective/entropy": 147.28927612304688, "objective/kl": 31.492679595947266, "objective/non_score_reward": -1.5746338367462158, "objective/rlhf_reward": -4.939285838340206, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 28.799278259277344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.796875, "step": 250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002530574798584 }, { "episode": 4032, "epoch": 0.024157889060646368, "loss/policy_avg": 0.09398385882377625, "lr": 9.83959611451943e-06, "objective/entropy": -45.248435974121094, "objective/kl": 28.402175903320312, "objective/non_score_reward": -1.4201087951660156, "objective/rlhf_reward": -4.018575882137405, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 19.838550567626953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.517578125, "step": 251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973764419555664 }, { "episode": 4048, "epoch": 0.024253753699775917, "loss/policy_avg": 0.19270983338356018, "lr": 9.838957055214724e-06, "objective/entropy": 77.1705093383789, "objective/kl": 34.050987243652344, "objective/non_score_reward": -1.7025493383407593, "objective/rlhf_reward": -5.076863960425058, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 18.725093841552734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4814453125, "step": 252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001063823699951 }, { "episode": 4064, "epoch": 0.024349618338905466, "loss/policy_avg": 0.4652649164199829, "lr": 9.838317995910021e-06, "objective/entropy": 257.7345886230469, "objective/kl": 24.133747100830078, "objective/non_score_reward": -1.2066874504089355, "objective/rlhf_reward": -3.4481475735581935, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 41.46368408203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.80078125, "step": 253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9955878257751465 }, { "episode": 4080, "epoch": 0.024445482978035015, "loss/policy_avg": 0.14692571759223938, "lr": 9.837678936605318e-06, "objective/entropy": 43.00188064575195, "objective/kl": 24.73518180847168, "objective/non_score_reward": -1.236759066581726, "objective/rlhf_reward": -3.568433978644711, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 75.05264282226562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990639686584473 }, { "episode": 4096, "epoch": 0.024541347617164564, "loss/policy_avg": 0.08271847665309906, "lr": 9.837039877300615e-06, "objective/entropy": -79.57066345214844, "objective/kl": 26.90784454345703, "objective/non_score_reward": -1.3453922271728516, "objective/rlhf_reward": -3.648235575358073, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 24.23294448852539, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619140625, "step": 255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984873533248901 }, { "episode": 4112, "epoch": 0.024637212256294113, "loss/policy_avg": 0.12403183430433273, "lr": 9.83640081799591e-06, "objective/entropy": 87.87326049804688, "objective/kl": 29.708419799804688, "objective/non_score_reward": -1.4854209423065186, "objective/rlhf_reward": -4.116855438026499, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 32.65428161621094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.62890625, "step": 256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981478452682495 }, { "episode": 4128, "epoch": 0.024733076895423662, "loss/policy_avg": -0.17764857411384583, "lr": 9.835761758691207e-06, "objective/entropy": 130.6345977783203, "objective/kl": 34.35237121582031, "objective/non_score_reward": -1.717618465423584, "objective/rlhf_reward": -5.314214794841364, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 118.99533081054688, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.3974609375, "step": 257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.014057159423828 }, { "episode": 4144, "epoch": 0.02482894153455321, "loss/policy_avg": 2.400163173675537, "lr": 9.835122699386504e-06, "objective/entropy": 123.72301483154297, "objective/kl": 21.25601577758789, "objective/non_score_reward": -1.0628007650375366, "objective/rlhf_reward": 0.1487968802452091, "objective/scores": 1.1, "policy/approxkl_avg": 36.07887268066406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.572265625, "step": 258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998108148574829 }, { "episode": 4160, "epoch": 0.02492480617368276, "loss/policy_avg": 0.3900964856147766, "lr": 9.8344836400818e-06, "objective/entropy": 233.3748321533203, "objective/kl": 42.447425842285156, "objective/non_score_reward": -2.1223714351654053, "objective/rlhf_reward": -5.5657667263757915, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 19.722026824951172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.74609375, "step": 259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000584363937378 }, { "episode": 4176, "epoch": 0.02502067081281231, "loss/policy_avg": 0.3361247181892395, "lr": 9.833844580777096e-06, "objective/entropy": 135.13961791992188, "objective/kl": 31.25783920288086, "objective/non_score_reward": -1.5628920793533325, "objective/rlhf_reward": -4.426739449771952, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 16.49414825439453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.486328125, "step": 260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986810684204102 }, { "episode": 4192, "epoch": 0.025116535451941858, "loss/policy_avg": 0.1438344419002533, "lr": 9.833205521472393e-06, "objective/entropy": 104.18168640136719, "objective/kl": 35.72525405883789, "objective/non_score_reward": -1.7862627506256104, "objective/rlhf_reward": -5.320222015651773, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.100770950317383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65625, "step": 261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996044635772705 }, { "episode": 4208, "epoch": 0.025212400091071407, "loss/policy_avg": 2.402132034301758, "lr": 9.83256646216769e-06, "objective/entropy": 91.16908264160156, "objective/kl": 29.633235931396484, "objective/non_score_reward": -1.4816619157791138, "objective/rlhf_reward": -4.476049522967681, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 43.586891174316406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.64453125, "step": 262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.004427433013916 }, { "episode": 4224, "epoch": 0.025308264730200956, "loss/policy_avg": 0.7259080410003662, "lr": 9.831927402862987e-06, "objective/entropy": 154.68115234375, "objective/kl": 37.00696563720703, "objective/non_score_reward": -1.8503483533859253, "objective/rlhf_reward": -5.576564307483743, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 16.052043914794922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.484375, "step": 263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9974215030670166 }, { "episode": 4240, "epoch": 0.025404129369330505, "loss/policy_avg": 0.09373458474874496, "lr": 9.831288343558284e-06, "objective/entropy": 72.85606384277344, "objective/kl": 27.522302627563477, "objective/non_score_reward": -1.376115083694458, "objective/rlhf_reward": -3.679631943973612, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 142.1138916015625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.310546875, "step": 264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991629123687744 }, { "episode": 4256, "epoch": 0.025499994008460054, "loss/policy_avg": 0.7555310130119324, "lr": 9.830649284253579e-06, "objective/entropy": 72.61222076416016, "objective/kl": 30.647029876708984, "objective/non_score_reward": -1.5323514938354492, "objective/rlhf_reward": -4.705573756893244, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 54.394874572753906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.587890625, "step": 265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006964206695557 }, { "episode": 4272, "epoch": 0.025595858647589603, "loss/policy_avg": 0.6551899313926697, "lr": 9.830010224948876e-06, "objective/entropy": 121.19924926757812, "objective/kl": 33.96527099609375, "objective/non_score_reward": -1.6982636451721191, "objective/rlhf_reward": -5.131195192754852, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 40.39656066894531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.470703125, "step": 266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999192476272583 }, { "episode": 4288, "epoch": 0.02569172328671915, "loss/policy_avg": 1.1016074419021606, "lr": 9.829371165644173e-06, "objective/entropy": 132.00601196289062, "objective/kl": 43.09049987792969, "objective/non_score_reward": -2.154524803161621, "objective/rlhf_reward": -7.102327191623386, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 126.27546691894531, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3935546875, "step": 267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990718364715576 }, { "episode": 4304, "epoch": 0.0257875879258487, "loss/policy_avg": 0.08981708437204361, "lr": 9.82873210633947e-06, "objective/entropy": 140.80239868164062, "objective/kl": 26.626178741455078, "objective/non_score_reward": -1.3313090801239014, "objective/rlhf_reward": -0.9252360224723812, "objective/scores": 1.1, "policy/approxkl_avg": 84.53665924072266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.841796875, "step": 268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997150182723999 }, { "episode": 4320, "epoch": 0.02588345256497825, "loss/policy_avg": 0.565528929233551, "lr": 9.828093047034766e-06, "objective/entropy": 138.6593017578125, "objective/kl": 32.08763885498047, "objective/non_score_reward": -1.604382038116455, "objective/rlhf_reward": -4.813408408228474, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 34.42543029785156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.427734375, "step": 269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0018558502197266 }, { "episode": 4336, "epoch": 0.0259793172041078, "loss/policy_avg": 0.4312899708747864, "lr": 9.827453987730061e-06, "objective/entropy": 20.17654800415039, "objective/kl": 23.528181076049805, "objective/non_score_reward": -1.176409125328064, "objective/rlhf_reward": -2.5829304478326183, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 20.440711975097656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7890625, "step": 270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989676475524902 }, { "episode": 4352, "epoch": 0.026075181843237347, "loss/policy_avg": 0.20729105174541473, "lr": 9.826814928425358e-06, "objective/entropy": 166.21115112304688, "objective/kl": 31.01326560974121, "objective/non_score_reward": -1.5506633520126343, "objective/rlhf_reward": -6.202653288841248, "objective/scores": 0.0, "policy/approxkl_avg": 34.41830825805664, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003887176513672 }, { "episode": 4368, "epoch": 0.026171046482366896, "loss/policy_avg": 3.2944061756134033, "lr": 9.826175869120655e-06, "objective/entropy": 28.755096435546875, "objective/kl": 31.482175827026367, "objective/non_score_reward": -1.5741088390350342, "objective/rlhf_reward": -4.917833187667233, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.366632461547852, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3701171875, "step": 272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0053317546844482 }, { "episode": 4384, "epoch": 0.02626691112149645, "loss/policy_avg": 0.23004142940044403, "lr": 9.825536809815952e-06, "objective/entropy": 54.82402038574219, "objective/kl": 32.45307922363281, "objective/non_score_reward": -1.6226541996002197, "objective/rlhf_reward": -5.148980966120391, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 31.775432586669922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.525390625, "step": 273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995795488357544 }, { "episode": 4400, "epoch": 0.026362775760625998, "loss/policy_avg": -0.08435960114002228, "lr": 9.824897750511249e-06, "objective/entropy": 98.25897216796875, "objective/kl": 28.68474578857422, "objective/non_score_reward": -1.4342372417449951, "objective/rlhf_reward": -5.73694920539856, "objective/scores": 0.0, "policy/approxkl_avg": 72.97157287597656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.517578125, "step": 274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0012598037719727 }, { "episode": 4416, "epoch": 0.026458640399755547, "loss/policy_avg": 0.41626134514808655, "lr": 9.824258691206546e-06, "objective/entropy": 83.60694885253906, "objective/kl": 30.977035522460938, "objective/non_score_reward": -1.548851728439331, "objective/rlhf_reward": -4.795407152175903, "objective/scores": 0.35, "policy/approxkl_avg": 39.04691696166992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.60546875, "step": 275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0021119117736816 }, { "episode": 4432, "epoch": 0.026554505038885096, "loss/policy_avg": 0.43957769870758057, "lr": 9.823619631901841e-06, "objective/entropy": 127.34529113769531, "objective/kl": 35.28544616699219, "objective/non_score_reward": -1.7642724514007568, "objective/rlhf_reward": -5.606491903872833, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 150.78646850585938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.748046875, "step": 276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9970619678497314 }, { "episode": 4448, "epoch": 0.026650369678014645, "loss/policy_avg": 0.8086847066879272, "lr": 9.822980572597138e-06, "objective/entropy": -119.74644470214844, "objective/kl": 26.706302642822266, "objective/non_score_reward": -1.335315227508545, "objective/rlhf_reward": -3.9412606716156002, "objective/scores": 0.35, "policy/approxkl_avg": 65.78569793701172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.568359375, "step": 277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986886978149414 }, { "episode": 4464, "epoch": 0.026746234317144194, "loss/policy_avg": 0.09760895371437073, "lr": 9.822341513292433e-06, "objective/entropy": 209.31890869140625, "objective/kl": 41.666831970214844, "objective/non_score_reward": -2.083341598510742, "objective/rlhf_reward": -6.7292466498056225, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 14.525606155395508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980053901672363 }, { "episode": 4480, "epoch": 0.026842098956273742, "loss/policy_avg": 0.0820450559258461, "lr": 9.82170245398773e-06, "objective/entropy": 152.01095581054688, "objective/kl": 29.104724884033203, "objective/non_score_reward": -1.4552361965179443, "objective/rlhf_reward": -4.159085219324218, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 21.12679100036621, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4560546875, "step": 279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.99649977684021 }, { "episode": 4496, "epoch": 0.02693796359540329, "loss/policy_avg": 0.08112587034702301, "lr": 9.821063394683027e-06, "objective/entropy": 49.22539138793945, "objective/kl": 32.40191650390625, "objective/non_score_reward": -1.6200958490371704, "objective/rlhf_reward": -5.029785375209197, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.874902725219727, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.404296875, "step": 280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0027780532836914 }, { "episode": 4512, "epoch": 0.02703382823453284, "loss/policy_avg": 0.41851094365119934, "lr": 9.820424335378324e-06, "objective/entropy": 108.13827514648438, "objective/kl": 44.792015075683594, "objective/non_score_reward": -2.239600658416748, "objective/rlhf_reward": -7.133574362072061, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 67.72032165527344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.537109375, "step": 281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979641437530518 }, { "episode": 4528, "epoch": 0.02712969287366239, "loss/policy_avg": 0.8327301144599915, "lr": 9.81978527607362e-06, "objective/entropy": 70.98486328125, "objective/kl": 43.82145690917969, "objective/non_score_reward": -2.191072702407837, "objective/rlhf_reward": -7.283338430340647, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.1268585920333862, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.427734375, "step": 282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002413749694824 }, { "episode": 4544, "epoch": 0.02722555751279194, "loss/policy_avg": 0.26003268361091614, "lr": 9.819146216768916e-06, "objective/entropy": 59.813140869140625, "objective/kl": 32.33997344970703, "objective/non_score_reward": -1.6169987916946411, "objective/rlhf_reward": -4.643166418346476, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 108.00172424316406, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.60546875, "step": 283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996032953262329 }, { "episode": 4560, "epoch": 0.027321422151921487, "loss/policy_avg": 0.06828334182500839, "lr": 9.818507157464213e-06, "objective/entropy": 164.7733154296875, "objective/kl": 36.976539611816406, "objective/non_score_reward": -1.8488272428512573, "objective/rlhf_reward": -5.791188750330525, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 22.712989807128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.498046875, "step": 284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998927354812622 }, { "episode": 4576, "epoch": 0.027417286791051036, "loss/policy_avg": 0.346102774143219, "lr": 9.81786809815951e-06, "objective/entropy": 141.91213989257812, "objective/kl": 29.89690589904785, "objective/non_score_reward": -1.4948452711105347, "objective/rlhf_reward": -4.5793810248374935, "objective/scores": 0.35, "policy/approxkl_avg": 4.914261817932129, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.583984375, "step": 285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991732835769653 }, { "episode": 4592, "epoch": 0.027513151430180585, "loss/policy_avg": 0.07111110538244247, "lr": 9.817229038854806e-06, "objective/entropy": -41.44879150390625, "objective/kl": 29.296417236328125, "objective/non_score_reward": -1.4648208618164062, "objective/rlhf_reward": -4.4806815172113, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 70.16557312011719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5625, "step": 286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982048273086548 }, { "episode": 4608, "epoch": 0.027609016069310134, "loss/policy_avg": 0.6204440593719482, "lr": 9.816589979550103e-06, "objective/entropy": 10.609687805175781, "objective/kl": 34.5562744140625, "objective/non_score_reward": -1.727813720703125, "objective/rlhf_reward": -5.552005314563198, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 44.11948776245117, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4677734375, "step": 287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9960062503814697 }, { "episode": 4624, "epoch": 0.027704880708439683, "loss/policy_avg": -0.3703474700450897, "lr": 9.8159509202454e-06, "objective/entropy": 16.20748519897461, "objective/kl": 40.348899841308594, "objective/non_score_reward": -2.0174450874328613, "objective/rlhf_reward": -5.6697804689407345, "objective/scores": 0.6, "policy/approxkl_avg": 58.94084167480469, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.451171875, "step": 288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000488519668579 }, { "episode": 4640, "epoch": 0.027800745347569232, "loss/policy_avg": 0.691341757774353, "lr": 9.815311860940695e-06, "objective/entropy": 164.64894104003906, "objective/kl": 35.96034240722656, "objective/non_score_reward": -1.7980170249938965, "objective/rlhf_reward": -2.792067980766296, "objective/scores": 1.1, "policy/approxkl_avg": 105.621826171875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.53125, "step": 289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9972904920578003 }, { "episode": 4656, "epoch": 0.02789660998669878, "loss/policy_avg": 0.05122673511505127, "lr": 9.814672801635992e-06, "objective/entropy": 143.17758178710938, "objective/kl": 27.651023864746094, "objective/non_score_reward": -1.3825511932373047, "objective/rlhf_reward": -2.6064857586633887, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 30.806257247924805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.462890625, "step": 290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996225118637085 }, { "episode": 4672, "epoch": 0.02799247462582833, "loss/policy_avg": -0.021466929465532303, "lr": 9.81403374233129e-06, "objective/entropy": 123.44010925292969, "objective/kl": 18.645748138427734, "objective/non_score_reward": -0.9322873950004578, "objective/rlhf_reward": -2.403636608153505, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 24.915597915649414, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.638671875, "step": 291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.995213270187378 }, { "episode": 4688, "epoch": 0.02808833926495788, "loss/policy_avg": 0.700859785079956, "lr": 9.813394683026586e-06, "objective/entropy": 58.48292922973633, "objective/kl": 28.2305965423584, "objective/non_score_reward": -1.411529779434204, "objective/rlhf_reward": -4.24611941576004, "objective/scores": 0.35, "policy/approxkl_avg": 21.04977035522461, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4208984375, "step": 292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981340169906616 }, { "episode": 4704, "epoch": 0.028184203904087428, "loss/policy_avg": 0.9605820775032043, "lr": 9.812755623721883e-06, "objective/entropy": -33.6519775390625, "objective/kl": 33.635501861572266, "objective/non_score_reward": -1.6817750930786133, "objective/rlhf_reward": -5.065241103590118, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 18.019363403320312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4619140625, "step": 293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000504493713379 }, { "episode": 4720, "epoch": 0.028280068543216977, "loss/policy_avg": 0.44443511962890625, "lr": 9.81211656441718e-06, "objective/entropy": 61.81305694580078, "objective/kl": 37.54548263549805, "objective/non_score_reward": -1.8772742748260498, "objective/rlhf_reward": -6.130494453994137, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 34.736690521240234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.701171875, "step": 294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981006383895874 }, { "episode": 4736, "epoch": 0.028375933182346526, "loss/policy_avg": -0.004817202687263489, "lr": 9.811477505112475e-06, "objective/entropy": -85.25079345703125, "objective/kl": 22.125272750854492, "objective/non_score_reward": -1.1062637567520142, "objective/rlhf_reward": -3.0658050415262412, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 39.945377349853516, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.619140625, "step": 295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001112937927246 }, { "episode": 4752, "epoch": 0.028471797821476075, "loss/policy_avg": -0.018911486491560936, "lr": 9.810838445807772e-06, "objective/entropy": 187.50953674316406, "objective/kl": 31.752737045288086, "objective/non_score_reward": -1.587636947631836, "objective/rlhf_reward": -4.525718684467386, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 41.095298767089844, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.560546875, "step": 296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0318245887756348 }, { "episode": 4768, "epoch": 0.028567662460605624, "loss/policy_avg": 0.5813855528831482, "lr": 9.810199386503069e-06, "objective/entropy": 13.395767211914062, "objective/kl": 29.76428985595703, "objective/non_score_reward": -1.4882144927978516, "objective/rlhf_reward": -4.219524757067362, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 58.40808868408203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971027374267578 }, { "episode": 4784, "epoch": 0.028663527099735173, "loss/policy_avg": 0.25174012780189514, "lr": 9.809560327198366e-06, "objective/entropy": 93.99857330322266, "objective/kl": 31.07823944091797, "objective/non_score_reward": -1.5539120435714722, "objective/rlhf_reward": -4.482314721743266, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 56.219329833984375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973026514053345 }, { "episode": 4800, "epoch": 0.02875939173886472, "loss/policy_avg": -0.05966740474104881, "lr": 9.808921267893663e-06, "objective/entropy": 199.3701934814453, "objective/kl": 26.15532684326172, "objective/non_score_reward": -1.3077664375305176, "objective/rlhf_reward": -3.7152936098896827, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 18.272422790527344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002091646194458 }, { "episode": 4816, "epoch": 0.02885525637799427, "loss/policy_avg": 0.19725301861763, "lr": 9.808282208588958e-06, "objective/entropy": 112.11613464355469, "objective/kl": 33.344722747802734, "objective/non_score_reward": -1.667236089706421, "objective/rlhf_reward": -6.668944478034973, "objective/scores": 0.0, "policy/approxkl_avg": 29.54242706298828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0038766860961914 }, { "episode": 4832, "epoch": 0.02895112101712382, "loss/policy_avg": -0.17506346106529236, "lr": 9.807643149284255e-06, "objective/entropy": 70.48281860351562, "objective/kl": 29.51511573791504, "objective/non_score_reward": -1.4757558107376099, "objective/rlhf_reward": -4.387251400741276, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 12.791141510009766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4814453125, "step": 301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999895691871643 }, { "episode": 4848, "epoch": 0.029046985656253372, "loss/policy_avg": 0.38140204548835754, "lr": 9.80700408997955e-06, "objective/entropy": 23.643152236938477, "objective/kl": 27.579925537109375, "objective/non_score_reward": -1.3789963722229004, "objective/rlhf_reward": -3.854125951946364, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 8.89024543762207, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984774589538574 }, { "episode": 4864, "epoch": 0.02914285029538292, "loss/policy_avg": 0.18466374278068542, "lr": 9.806365030674847e-06, "objective/entropy": -30.63671875, "objective/kl": 25.678733825683594, "objective/non_score_reward": -1.2839367389678955, "objective/rlhf_reward": -3.6199750540577735, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 4.08036470413208, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53515625, "step": 303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999252200126648 }, { "episode": 4880, "epoch": 0.02923871493451247, "loss/policy_avg": 0.20352232456207275, "lr": 9.805725971370144e-06, "objective/entropy": -14.465229034423828, "objective/kl": 16.88151741027832, "objective/non_score_reward": -0.8440757989883423, "objective/rlhf_reward": 1.023696751892567, "objective/scores": 1.1, "policy/approxkl_avg": 16.945369720458984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.51171875, "step": 304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997638463973999 }, { "episode": 4896, "epoch": 0.02933457957364202, "loss/policy_avg": 0.36892420053482056, "lr": 9.80508691206544e-06, "objective/entropy": 136.53363037109375, "objective/kl": 30.262548446655273, "objective/non_score_reward": -1.513127326965332, "objective/rlhf_reward": -3.1287905319940776, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 30.166175842285156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.615234375, "step": 305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0001025199890137 }, { "episode": 4912, "epoch": 0.029430444212771568, "loss/policy_avg": 0.07577557861804962, "lr": 9.804447852760737e-06, "objective/entropy": 77.17935943603516, "objective/kl": 28.32352638244629, "objective/non_score_reward": -1.4161763191223145, "objective/rlhf_reward": -4.148933493884739, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.6957955360412598, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.63671875, "step": 306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0025148391723633 }, { "episode": 4928, "epoch": 0.029526308851901117, "loss/policy_avg": 0.1559610664844513, "lr": 9.803808793456034e-06, "objective/entropy": -16.938400268554688, "objective/kl": 21.827743530273438, "objective/non_score_reward": -1.091387152671814, "objective/rlhf_reward": -2.703689043939696, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.885660171508789, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.431640625, "step": 307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0027849674224854 }, { "episode": 4944, "epoch": 0.029622173491030666, "loss/policy_avg": -0.17305535078048706, "lr": 9.80316973415133e-06, "objective/entropy": -31.412694931030273, "objective/kl": 23.805431365966797, "objective/non_score_reward": -1.1902716159820557, "objective/rlhf_reward": -3.1569663322606853, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 35.29633331298828, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.654296875, "step": 308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0007619857788086 }, { "episode": 4960, "epoch": 0.029718038130160215, "loss/policy_avg": 0.13406828045845032, "lr": 9.802530674846626e-06, "objective/entropy": 68.0604248046875, "objective/kl": 31.641517639160156, "objective/non_score_reward": -1.582075834274292, "objective/rlhf_reward": -4.949701407042843, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 32.652069091796875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3017578125, "step": 309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981597661972046 }, { "episode": 4976, "epoch": 0.029813902769289764, "loss/policy_avg": 0.3640270233154297, "lr": 9.801891615541923e-06, "objective/entropy": 73.73117065429688, "objective/kl": 22.181957244873047, "objective/non_score_reward": -1.109097957611084, "objective/rlhf_reward": -4.436391651630402, "objective/scores": 0.0, "policy/approxkl_avg": 24.474929809570312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.77734375, "step": 310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988360404968262 }, { "episode": 4992, "epoch": 0.029909767408419313, "loss/policy_avg": 0.598778486251831, "lr": 9.80125255623722e-06, "objective/entropy": 77.45819854736328, "objective/kl": 31.91500473022461, "objective/non_score_reward": -1.5957502126693726, "objective/rlhf_reward": -4.558172132047723, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.392116546630859, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595703125, "step": 311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976756572723389 }, { "episode": 5008, "epoch": 0.03000563204754886, "loss/policy_avg": -0.14829277992248535, "lr": 9.800613496932517e-06, "objective/entropy": 73.91107940673828, "objective/kl": 22.043235778808594, "objective/non_score_reward": -1.1021617650985718, "objective/rlhf_reward": -3.0086471796035763, "objective/scores": 0.35, "policy/approxkl_avg": 7.375496864318848, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.548828125, "step": 312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0021305084228516 }, { "episode": 5024, "epoch": 0.03010149668667841, "loss/policy_avg": 0.34449532628059387, "lr": 9.799974437627812e-06, "objective/entropy": 27.04425048828125, "objective/kl": 31.98007583618164, "objective/non_score_reward": -1.599003791809082, "objective/rlhf_reward": -4.996015524864196, "objective/scores": 0.35, "policy/approxkl_avg": 53.630210876464844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.529296875, "step": 313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990134239196777 }, { "episode": 5040, "epoch": 0.03019736132580796, "loss/policy_avg": 0.029857225716114044, "lr": 9.799335378323109e-06, "objective/entropy": 147.96096801757812, "objective/kl": 27.342838287353516, "objective/non_score_reward": -1.3671419620513916, "objective/rlhf_reward": -4.017969946475372, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 18.108400344848633, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.638671875, "step": 314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988747835159302 }, { "episode": 5056, "epoch": 0.03029322596493751, "loss/policy_avg": 0.05283927917480469, "lr": 9.798696319018406e-06, "objective/entropy": -46.846099853515625, "objective/kl": 30.715242385864258, "objective/non_score_reward": -1.535762071609497, "objective/rlhf_reward": -4.538928542200642, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 66.26033020019531, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6484375, "step": 315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992979764938354 }, { "episode": 5072, "epoch": 0.030389090604067057, "loss/policy_avg": 0.2858242094516754, "lr": 9.798057259713703e-06, "objective/entropy": -156.9435577392578, "objective/kl": 31.284622192382812, "objective/non_score_reward": -1.5642311573028564, "objective/rlhf_reward": -4.915288856535583, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 74.38943481445312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7421875, "step": 316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9992833137512207 }, { "episode": 5088, "epoch": 0.030484955243196606, "loss/policy_avg": 0.28274843096733093, "lr": 9.797418200409e-06, "objective/entropy": -214.69573974609375, "objective/kl": 22.27606201171875, "objective/non_score_reward": -1.1138031482696533, "objective/rlhf_reward": -2.3325063607850414, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 35.48945236206055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.505859375, "step": 317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9962687492370605 }, { "episode": 5104, "epoch": 0.030580819882326155, "loss/policy_avg": -0.08736838400363922, "lr": 9.796779141104296e-06, "objective/entropy": -18.148971557617188, "objective/kl": 27.546077728271484, "objective/non_score_reward": -1.377303957939148, "objective/rlhf_reward": -4.1306134844697535, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 76.84832000732422, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6171875, "step": 318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0014290809631348 }, { "episode": 5120, "epoch": 0.030676684521455704, "loss/policy_avg": 0.031098078936338425, "lr": 9.796140081799592e-06, "objective/entropy": 103.30211639404297, "objective/kl": 27.747032165527344, "objective/non_score_reward": -1.3873515129089355, "objective/rlhf_reward": -4.033634447845158, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 54.69970703125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985463619232178 }, { "episode": 5136, "epoch": 0.030772549160585253, "loss/policy_avg": 0.3622899651527405, "lr": 9.795501022494888e-06, "objective/entropy": 66.0567398071289, "objective/kl": 26.39444351196289, "objective/non_score_reward": -1.3197221755981445, "objective/rlhf_reward": -3.7226295759349615, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.640605449676514, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992847442626953 }, { "episode": 5152, "epoch": 0.030868413799714802, "loss/policy_avg": -0.10469883680343628, "lr": 9.794861963190185e-06, "objective/entropy": 35.81920623779297, "objective/kl": 25.668739318847656, "objective/non_score_reward": -1.2834370136260986, "objective/rlhf_reward": -3.7337480843067166, "objective/scores": 0.35, "policy/approxkl_avg": 5.808808326721191, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6484375, "step": 321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999007225036621 }, { "episode": 5168, "epoch": 0.03096427843884435, "loss/policy_avg": -0.2741212248802185, "lr": 9.794222903885482e-06, "objective/entropy": 52.38888168334961, "objective/kl": 34.969974517822266, "objective/non_score_reward": -1.748498797416687, "objective/rlhf_reward": -5.652359655409484, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.913843154907227, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3798828125, "step": 322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0103840827941895 }, { "episode": 5184, "epoch": 0.0310601430779739, "loss/policy_avg": 0.30122414231300354, "lr": 9.793583844580777e-06, "objective/entropy": 134.16075134277344, "objective/kl": 25.608116149902344, "objective/non_score_reward": -1.280405879020691, "objective/rlhf_reward": -3.1742123318480804, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 74.33633422851562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603515625, "step": 323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.014462471008301 }, { "episode": 5200, "epoch": 0.03115600771710345, "loss/policy_avg": 0.26204991340637207, "lr": 9.792944785276074e-06, "objective/entropy": 2.559833526611328, "objective/kl": 25.519519805908203, "objective/non_score_reward": -1.2759759426116943, "objective/rlhf_reward": -3.74465426180212, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 92.09954071044922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.57421875, "step": 324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998145580291748 }, { "episode": 5216, "epoch": 0.031251872356233, "loss/policy_avg": 0.18864840269088745, "lr": 9.792305725971371e-06, "objective/entropy": 48.99184036254883, "objective/kl": 28.022377014160156, "objective/non_score_reward": -1.4011187553405762, "objective/rlhf_reward": -4.123522403653025, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 22.120746612548828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984712600708008 }, { "episode": 5232, "epoch": 0.03134773699536255, "loss/policy_avg": 0.42162489891052246, "lr": 9.791666666666666e-06, "objective/entropy": -129.23065185546875, "objective/kl": 31.687660217285156, "objective/non_score_reward": -1.5843830108642578, "objective/rlhf_reward": -4.821760052236256, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 111.98194885253906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996734857559204 }, { "episode": 5248, "epoch": 0.0314436016344921, "loss/policy_avg": -0.07900102436542511, "lr": 9.791027607361963e-06, "objective/entropy": 31.351696014404297, "objective/kl": 27.038206100463867, "objective/non_score_reward": -1.3519103527069092, "objective/rlhf_reward": -5.407641291618347, "objective/scores": 0.0, "policy/approxkl_avg": 9.7061767578125, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4423828125, "step": 327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0043118000030518 }, { "episode": 5264, "epoch": 0.03153946627362165, "loss/policy_avg": 0.16587843000888824, "lr": 9.79038854805726e-06, "objective/entropy": 143.86651611328125, "objective/kl": 27.42593765258789, "objective/non_score_reward": -1.3712968826293945, "objective/rlhf_reward": -4.125937962268276, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 119.49800872802734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999497890472412 }, { "episode": 5280, "epoch": 0.0316353309127512, "loss/policy_avg": 0.29106539487838745, "lr": 9.789749488752557e-06, "objective/entropy": 67.8651351928711, "objective/kl": 32.114479064941406, "objective/non_score_reward": -1.6057239770889282, "objective/rlhf_reward": -5.08126013567987, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 7.976801872253418, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.470703125, "step": 329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0011985301971436 }, { "episode": 5296, "epoch": 0.031731195551880746, "loss/policy_avg": 0.5780457258224487, "lr": 9.789110429447854e-06, "objective/entropy": 104.15371704101562, "objective/kl": 30.92220687866211, "objective/non_score_reward": -1.5461102724075317, "objective/rlhf_reward": -3.784441030025482, "objective/scores": 0.6, "policy/approxkl_avg": 52.566375732421875, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.55859375, "step": 330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993044137954712 }, { "episode": 5312, "epoch": 0.031827060191010295, "loss/policy_avg": 0.24728742241859436, "lr": 9.78847137014315e-06, "objective/entropy": -95.75634765625, "objective/kl": 30.755779266357422, "objective/non_score_reward": -1.5377888679504395, "objective/rlhf_reward": -4.791905486319942, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 32.567970275878906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.666015625, "step": 331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991464614868164 }, { "episode": 5328, "epoch": 0.031922924830139844, "loss/policy_avg": 1.9531396627426147, "lr": 9.787832310838446e-06, "objective/entropy": 18.057151794433594, "objective/kl": 21.966590881347656, "objective/non_score_reward": -1.0983295440673828, "objective/rlhf_reward": -2.993318116664886, "objective/scores": 0.35, "policy/approxkl_avg": 11.555295944213867, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.716796875, "step": 332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.004971981048584 }, { "episode": 5344, "epoch": 0.03201878946926939, "loss/policy_avg": 0.0304682869464159, "lr": 9.787193251533743e-06, "objective/entropy": -100.86114501953125, "objective/kl": 21.19540023803711, "objective/non_score_reward": -1.0597699880599976, "objective/rlhf_reward": -2.8604777837670863, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 36.17786407470703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46875, "step": 333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997187852859497 }, { "episode": 5360, "epoch": 0.03211465410839894, "loss/policy_avg": 0.2974792718887329, "lr": 9.78655419222904e-06, "objective/entropy": 59.0064697265625, "objective/kl": 23.83527183532715, "objective/non_score_reward": -1.1917636394500732, "objective/rlhf_reward": -3.2861017016724343, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 27.08124542236328, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.76171875, "step": 334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005297660827637 }, { "episode": 5376, "epoch": 0.03221051874752849, "loss/policy_avg": 0.20310130715370178, "lr": 9.785915132924337e-06, "objective/entropy": 51.579200744628906, "objective/kl": 26.064043045043945, "objective/non_score_reward": -1.3032021522521973, "objective/rlhf_reward": -3.656549363341883, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.1224026679992676, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.51171875, "step": 335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0020689964294434 }, { "episode": 5392, "epoch": 0.03230638338665804, "loss/policy_avg": -0.22360196709632874, "lr": 9.785276073619633e-06, "objective/entropy": 8.019195556640625, "objective/kl": 34.267356872558594, "objective/non_score_reward": -1.7133680582046509, "objective/rlhf_reward": -5.40287409266983, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 27.402694702148438, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.517578125, "step": 336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99736487865448 }, { "episode": 5408, "epoch": 0.03240224802578759, "loss/policy_avg": 0.394004225730896, "lr": 9.784637014314929e-06, "objective/entropy": -7.316375732421875, "objective/kl": 34.60337829589844, "objective/non_score_reward": -1.7301688194274902, "objective/rlhf_reward": -3.9969565018427105, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 60.58606719970703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990360736846924 }, { "episode": 5424, "epoch": 0.03249811266491714, "loss/policy_avg": 0.08118537068367004, "lr": 9.783997955010226e-06, "objective/entropy": 3.808826446533203, "objective/kl": 33.9757080078125, "objective/non_score_reward": -1.6987853050231934, "objective/rlhf_reward": -5.3713093592720895, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 49.47349548339844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.546875, "step": 338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974095821380615 }, { "episode": 5440, "epoch": 0.03259397730404669, "loss/policy_avg": 0.1250596046447754, "lr": 9.783358895705522e-06, "objective/entropy": -42.7471809387207, "objective/kl": 27.222618103027344, "objective/non_score_reward": -1.361130952835083, "objective/rlhf_reward": -3.9287524459683265, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.669515609741211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.509765625, "step": 339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002639293670654 }, { "episode": 5456, "epoch": 0.032689841943176236, "loss/policy_avg": 1.2977867126464844, "lr": 9.78271983640082e-06, "objective/entropy": -60.51675796508789, "objective/kl": 27.726932525634766, "objective/non_score_reward": -1.3863465785980225, "objective/rlhf_reward": -4.064433994706034, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 52.59510803222656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4345703125, "step": 340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984922409057617 }, { "episode": 5472, "epoch": 0.032785706582305785, "loss/policy_avg": 0.10771232098340988, "lr": 9.782080777096116e-06, "objective/entropy": 39.22501754760742, "objective/kl": 38.581573486328125, "objective/non_score_reward": -1.9290788173675537, "objective/rlhf_reward": -6.374679616003662, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 16.336502075195312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.599609375, "step": 341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990124702453613 }, { "episode": 5488, "epoch": 0.032881571221435334, "loss/policy_avg": 0.029969744384288788, "lr": 9.781441717791413e-06, "objective/entropy": 54.763675689697266, "objective/kl": 27.586057662963867, "objective/non_score_reward": -1.379302978515625, "objective/rlhf_reward": -3.7838785807291666, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 29.997591018676758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4462890625, "step": 342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992620944976807 }, { "episode": 5504, "epoch": 0.03297743586056488, "loss/policy_avg": -0.003006638027727604, "lr": 9.780802658486708e-06, "objective/entropy": 4.6327056884765625, "objective/kl": 25.01122283935547, "objective/non_score_reward": -1.250560998916626, "objective/rlhf_reward": -3.054833005146916, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.332850694656372, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.51171875, "step": 343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0042150020599365 }, { "episode": 5520, "epoch": 0.03307330049969443, "loss/policy_avg": -0.2595655918121338, "lr": 9.780163599182005e-06, "objective/entropy": -9.382579803466797, "objective/kl": 25.310394287109375, "objective/non_score_reward": -1.2655197381973267, "objective/rlhf_reward": -3.611480812640533, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 35.86376190185547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991991519927979 }, { "episode": 5536, "epoch": 0.03316916513882398, "loss/policy_avg": 1.6723182201385498, "lr": 9.7795245398773e-06, "objective/entropy": 167.249267578125, "objective/kl": 38.30883026123047, "objective/non_score_reward": -1.915441632270813, "objective/rlhf_reward": -6.283164360610348, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 27.648231506347656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.521484375, "step": 345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9966659545898438 }, { "episode": 5552, "epoch": 0.03326502977795353, "loss/policy_avg": 0.21136921644210815, "lr": 9.778885480572597e-06, "objective/entropy": 202.48263549804688, "objective/kl": 28.62633514404297, "objective/non_score_reward": -1.4313167333602905, "objective/rlhf_reward": -4.169007628169611, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 28.591995239257812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9962763786315918 }, { "episode": 5568, "epoch": 0.03336089441708308, "loss/policy_avg": 0.030091844499111176, "lr": 9.778246421267894e-06, "objective/entropy": 178.1235809326172, "objective/kl": 37.731300354003906, "objective/non_score_reward": -1.8865652084350586, "objective/rlhf_reward": -5.990001528468683, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 17.381601333618164, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.52734375, "step": 347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001326322555542 }, { "episode": 5584, "epoch": 0.03345675905621263, "loss/policy_avg": 0.40717682242393494, "lr": 9.777607361963191e-06, "objective/entropy": 90.73904418945312, "objective/kl": 31.88462257385254, "objective/non_score_reward": -1.594231128692627, "objective/rlhf_reward": -5.0176747677072715, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 37.96768569946289, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5703125, "step": 348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991170167922974 }, { "episode": 5600, "epoch": 0.033552623695342176, "loss/policy_avg": 0.5422201156616211, "lr": 9.776968302658488e-06, "objective/entropy": 80.41102600097656, "objective/kl": 34.64447021484375, "objective/non_score_reward": -1.7322235107421875, "objective/rlhf_reward": -5.478295783610687, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 117.23408508300781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.54296875, "step": 349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983158111572266 }, { "episode": 5616, "epoch": 0.033648488334471725, "loss/policy_avg": 0.3756037950515747, "lr": 9.776329243353783e-06, "objective/entropy": 61.65838623046875, "objective/kl": 44.269325256347656, "objective/non_score_reward": -2.213466167449951, "objective/rlhf_reward": -7.40326676806961, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 19.3502254486084, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988865852355957 }, { "episode": 5632, "epoch": 0.033744352973601274, "loss/policy_avg": 0.9775654673576355, "lr": 9.77569018404908e-06, "objective/entropy": 57.90337371826172, "objective/kl": 41.80830383300781, "objective/non_score_reward": -2.0904150009155273, "objective/rlhf_reward": -6.628326908747354, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 84.0235824584961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.84765625, "step": 351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9960914850234985 }, { "episode": 5648, "epoch": 0.03384021761273082, "loss/policy_avg": -0.20816992223262787, "lr": 9.775051124744377e-06, "objective/entropy": -118.41542053222656, "objective/kl": 23.201061248779297, "objective/non_score_reward": -1.160053014755249, "objective/rlhf_reward": -2.8153834894028416, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.062729835510254, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.4453125, "step": 352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0035219192504883 }, { "episode": 5664, "epoch": 0.03393608225186037, "loss/policy_avg": 0.035901207476854324, "lr": 9.774412065439674e-06, "objective/entropy": 154.33920288085938, "objective/kl": 28.773828506469727, "objective/non_score_reward": -1.4386913776397705, "objective/rlhf_reward": -2.8310468539011207, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.329944610595703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4755859375, "step": 353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0003747940063477 }, { "episode": 5680, "epoch": 0.03403194689098992, "loss/policy_avg": 0.7185342311859131, "lr": 9.77377300613497e-06, "objective/entropy": 45.80010986328125, "objective/kl": 35.51177215576172, "objective/non_score_reward": -1.7755887508392334, "objective/rlhf_reward": -5.586583339961704, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 69.95939636230469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.51953125, "step": 354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996915340423584 }, { "episode": 5696, "epoch": 0.03412781153011947, "loss/policy_avg": 0.871320903301239, "lr": 9.773133946830267e-06, "objective/entropy": 136.34942626953125, "objective/kl": 37.25979995727539, "objective/non_score_reward": -1.862990140914917, "objective/rlhf_reward": -5.504549334721501, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 58.879180908203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990177154541016 }, { "episode": 5712, "epoch": 0.03422367616924902, "loss/policy_avg": 0.14556461572647095, "lr": 9.772494887525563e-06, "objective/entropy": -10.28516960144043, "objective/kl": 29.231609344482422, "objective/non_score_reward": -1.461580514907837, "objective/rlhf_reward": -4.021493013176035, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 39.2762451171875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985218048095703 }, { "episode": 5728, "epoch": 0.03431954080837857, "loss/policy_avg": 0.27659082412719727, "lr": 9.77185582822086e-06, "objective/entropy": -36.31108093261719, "objective/kl": 32.386661529541016, "objective/non_score_reward": -1.619333028793335, "objective/rlhf_reward": -6.47733199596405, "objective/scores": 0.0, "policy/approxkl_avg": 10.265704154968262, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71484375, "step": 357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992687702178955 }, { "episode": 5744, "epoch": 0.03441540544750812, "loss/policy_avg": 0.10546956956386566, "lr": 9.771216768916156e-06, "objective/entropy": 79.19872283935547, "objective/kl": 22.353626251220703, "objective/non_score_reward": -1.1176813840866089, "objective/rlhf_reward": -3.0201275154069513, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 20.73809051513672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4873046875, "step": 358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974703788757324 }, { "episode": 5760, "epoch": 0.034511270086637666, "loss/policy_avg": 0.5648351311683655, "lr": 9.770577709611453e-06, "objective/entropy": 38.47356033325195, "objective/kl": 23.87390899658203, "objective/non_score_reward": -1.1936955451965332, "objective/rlhf_reward": -3.4331463485056455, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.14659595489502, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53125, "step": 359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0011978149414062 }, { "episode": 5776, "epoch": 0.034607134725767215, "loss/policy_avg": 0.5912380814552307, "lr": 9.76993865030675e-06, "objective/entropy": 116.97152709960938, "objective/kl": 40.231689453125, "objective/non_score_reward": -2.011584758758545, "objective/rlhf_reward": -6.565385702069163, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 117.33955383300781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.666015625, "step": 360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969582557678223 }, { "episode": 5792, "epoch": 0.034702999364896764, "loss/policy_avg": -0.019477106630802155, "lr": 9.769299591002045e-06, "objective/entropy": -144.96791076660156, "objective/kl": 27.773448944091797, "objective/non_score_reward": -1.3886725902557373, "objective/rlhf_reward": -5.554690062999725, "objective/scores": 0.0, "policy/approxkl_avg": 7.48216438293457, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000136137008667 }, { "episode": 5808, "epoch": 0.03479886400402631, "loss/policy_avg": -0.5155759453773499, "lr": 9.768660531697342e-06, "objective/entropy": 78.00074768066406, "objective/kl": 34.501590728759766, "objective/non_score_reward": -1.7250795364379883, "objective/rlhf_reward": -5.521715917674404, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 122.40145874023438, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.513671875, "step": 362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.02427077293396 }, { "episode": 5824, "epoch": 0.03489472864315586, "loss/policy_avg": 0.3520805537700653, "lr": 9.768021472392639e-06, "objective/entropy": -66.29779815673828, "objective/kl": 23.767650604248047, "objective/non_score_reward": -1.188382625579834, "objective/rlhf_reward": -3.3535303235054013, "objective/scores": 0.35, "policy/approxkl_avg": 66.86349487304688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.51953125, "step": 363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973565340042114 }, { "episode": 5840, "epoch": 0.03499059328228541, "loss/policy_avg": 0.25808075070381165, "lr": 9.767382413087936e-06, "objective/entropy": 55.69321060180664, "objective/kl": 32.73713684082031, "objective/non_score_reward": -1.6368569135665894, "objective/rlhf_reward": -4.147427594661712, "objective/scores": 0.6, "policy/approxkl_avg": 17.00968360900879, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998908519744873 }, { "episode": 5856, "epoch": 0.03508645792141496, "loss/policy_avg": -0.33678027987480164, "lr": 9.766743353783233e-06, "objective/entropy": 63.459205627441406, "objective/kl": 36.74503707885742, "objective/non_score_reward": -1.837251901626587, "objective/rlhf_reward": -5.226301344410453, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 63.5507926940918, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7734375, "step": 365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0005531311035156 }, { "episode": 5872, "epoch": 0.03518232256054451, "loss/policy_avg": 0.397920161485672, "lr": 9.76610429447853e-06, "objective/entropy": -11.37314224243164, "objective/kl": 32.99299240112305, "objective/non_score_reward": -1.6496496200561523, "objective/rlhf_reward": -5.174766202171413, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 28.19782257080078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.65625, "step": 366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984650611877441 }, { "episode": 5888, "epoch": 0.03527818719967406, "loss/policy_avg": 0.5101684331893921, "lr": 9.765465235173825e-06, "objective/entropy": 122.12913513183594, "objective/kl": 39.20099639892578, "objective/non_score_reward": -1.9600497484207153, "objective/rlhf_reward": -6.480949008201046, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 10.180255889892578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.45703125, "step": 367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976162910461426 }, { "episode": 5904, "epoch": 0.03537405183880361, "loss/policy_avg": -0.46757811307907104, "lr": 9.764826175869122e-06, "objective/entropy": -108.47764587402344, "objective/kl": 25.862443923950195, "objective/non_score_reward": -1.2931220531463623, "objective/rlhf_reward": -3.6162289073138982, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.3750016689300537, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.703125, "step": 368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0032150745391846 }, { "episode": 5920, "epoch": 0.035469916477933155, "loss/policy_avg": 0.12928390502929688, "lr": 9.764187116564417e-06, "objective/entropy": 47.25078201293945, "objective/kl": 23.20449447631836, "objective/non_score_reward": -1.1602246761322021, "objective/rlhf_reward": -2.240898942947388, "objective/scores": 0.6, "policy/approxkl_avg": 2.1992838382720947, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.544921875, "step": 369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0065484046936035 }, { "episode": 5936, "epoch": 0.035565781117062704, "loss/policy_avg": 0.15939241647720337, "lr": 9.763548057259714e-06, "objective/entropy": -19.609264373779297, "objective/kl": 28.25977325439453, "objective/non_score_reward": -1.4129884243011475, "objective/rlhf_reward": -4.273351618138653, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 59.99807357788086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0036120414733887 }, { "episode": 5952, "epoch": 0.03566164575619225, "loss/policy_avg": 0.1767190843820572, "lr": 9.76290899795501e-06, "objective/entropy": -11.536600112915039, "objective/kl": 36.28870391845703, "objective/non_score_reward": -1.8144354820251465, "objective/rlhf_reward": -7.257741451263428, "objective/scores": 0.0, "policy/approxkl_avg": 11.846475601196289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99981689453125 }, { "episode": 5968, "epoch": 0.0357575103953218, "loss/policy_avg": 0.3314260244369507, "lr": 9.762269938650308e-06, "objective/entropy": -30.279476165771484, "objective/kl": 28.756494522094727, "objective/non_score_reward": -1.4378247261047363, "objective/rlhf_reward": -4.300700943084106, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 39.342529296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.623046875, "step": 372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998626708984375 }, { "episode": 5984, "epoch": 0.03585337503445135, "loss/policy_avg": 0.18494009971618652, "lr": 9.761630879345604e-06, "objective/entropy": 68.65098571777344, "objective/kl": 36.555747985839844, "objective/non_score_reward": -1.8277872800827026, "objective/rlhf_reward": -5.486320610317301, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.346623420715332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.517578125, "step": 373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000062942504883 }, { "episode": 6000, "epoch": 0.03594923967358091, "loss/policy_avg": -0.05254024267196655, "lr": 9.7609918200409e-06, "objective/entropy": -30.816913604736328, "objective/kl": 26.80430793762207, "objective/non_score_reward": -1.3402154445648193, "objective/rlhf_reward": -3.845090114864048, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.3415722846984863, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.583984375, "step": 374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991049766540527 }, { "episode": 6016, "epoch": 0.036045104312710456, "loss/policy_avg": 0.48673489689826965, "lr": 9.760352760736196e-06, "objective/entropy": -54.172760009765625, "objective/kl": 26.726612091064453, "objective/non_score_reward": -1.3363306522369385, "objective/rlhf_reward": -0.945322489738464, "objective/scores": 1.1, "policy/approxkl_avg": 36.357444763183594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.708984375, "step": 375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999312400817871 }, { "episode": 6032, "epoch": 0.036140968951840005, "loss/policy_avg": -0.06733483076095581, "lr": 9.759713701431493e-06, "objective/entropy": 135.20721435546875, "objective/kl": 37.13209915161133, "objective/non_score_reward": -1.856605052947998, "objective/rlhf_reward": -4.5027009590875835, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 17.97521209716797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4873046875, "step": 376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002077341079712 }, { "episode": 6048, "epoch": 0.036236833590969554, "loss/policy_avg": -0.041654448956251144, "lr": 9.75907464212679e-06, "objective/entropy": -167.12548828125, "objective/kl": 25.773399353027344, "objective/non_score_reward": -1.2886700630187988, "objective/rlhf_reward": -0.7546801328659054, "objective/scores": 1.1, "policy/approxkl_avg": 0.800922691822052, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.521484375, "step": 377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000338315963745 }, { "episode": 6064, "epoch": 0.0363326982300991, "loss/policy_avg": 0.03024141490459442, "lr": 9.758435582822087e-06, "objective/entropy": -73.82417297363281, "objective/kl": 26.33017349243164, "objective/non_score_reward": -1.3165086507797241, "objective/rlhf_reward": -3.14332831122068, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 23.01593780517578, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.63671875, "step": 378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002521276473999 }, { "episode": 6080, "epoch": 0.03642856286922865, "loss/policy_avg": 0.285569429397583, "lr": 9.757796523517384e-06, "objective/entropy": -111.42575073242188, "objective/kl": 28.885374069213867, "objective/non_score_reward": -1.4442687034606934, "objective/rlhf_reward": -4.398472824183804, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 79.57511901855469, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.61328125, "step": 379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979077577590942 }, { "episode": 6096, "epoch": 0.0365244275083582, "loss/policy_avg": -0.022392742335796356, "lr": 9.75715746421268e-06, "objective/entropy": -79.86695098876953, "objective/kl": 17.694236755371094, "objective/non_score_reward": -0.8847118616104126, "objective/rlhf_reward": -1.7140187576142063, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.339657306671143, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.767578125, "step": 380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0013508796691895 }, { "episode": 6112, "epoch": 0.03662029214748775, "loss/policy_avg": 0.4459357261657715, "lr": 9.756518404907976e-06, "objective/entropy": -148.62872314453125, "objective/kl": 21.098934173583984, "objective/non_score_reward": -1.054946780204773, "objective/rlhf_reward": 0.1802129983901981, "objective/scores": 1.1, "policy/approxkl_avg": 6.359186172485352, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.576171875, "step": 381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992458820343018 }, { "episode": 6128, "epoch": 0.0367161567866173, "loss/policy_avg": -0.012147974222898483, "lr": 9.755879345603273e-06, "objective/entropy": 152.35232543945312, "objective/kl": 31.486684799194336, "objective/non_score_reward": -1.5743342638015747, "objective/rlhf_reward": -3.3736180409204692, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 35.51153564453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999483585357666 }, { "episode": 6144, "epoch": 0.03681202142574685, "loss/policy_avg": 0.012859173119068146, "lr": 9.75524028629857e-06, "objective/entropy": 26.343887329101562, "objective/kl": 33.34328079223633, "objective/non_score_reward": -1.6671642065048218, "objective/rlhf_reward": -4.721245358662541, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 22.614994049072266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.82421875, "step": 383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0018911361694336 }, { "episode": 6160, "epoch": 0.0369078860648764, "loss/policy_avg": 0.21653258800506592, "lr": 9.754601226993867e-06, "objective/entropy": 109.49678039550781, "objective/kl": 43.73469543457031, "objective/non_score_reward": -2.186734676361084, "objective/rlhf_reward": -7.296340326876983, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 31.000137329101562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689453125, "step": 384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001706123352051 }, { "episode": 6176, "epoch": 0.037003750704005946, "loss/policy_avg": 0.17637991905212402, "lr": 9.753962167689162e-06, "objective/entropy": -57.256038665771484, "objective/kl": 20.548786163330078, "objective/non_score_reward": -1.0274393558502197, "objective/rlhf_reward": -1.9870514891305304, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.945226669311523, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.71484375, "step": 385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000217914581299 }, { "episode": 6192, "epoch": 0.037099615343135495, "loss/policy_avg": 0.23474755883216858, "lr": 9.753323108384459e-06, "objective/entropy": -67.67970275878906, "objective/kl": 29.886417388916016, "objective/non_score_reward": -1.4943209886550903, "objective/rlhf_reward": -4.461511933597263, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 30.2872314453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.517578125, "step": 386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9968770742416382 }, { "episode": 6208, "epoch": 0.037195479982265044, "loss/policy_avg": 3.0326309204101562, "lr": 9.752684049079756e-06, "objective/entropy": -30.304298400878906, "objective/kl": 34.21199035644531, "objective/non_score_reward": -1.710599660873413, "objective/rlhf_reward": -5.391800324530944, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 191.66567993164062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998448133468628 }, { "episode": 6224, "epoch": 0.03729134462139459, "loss/policy_avg": 0.020012550055980682, "lr": 9.752044989775053e-06, "objective/entropy": -44.4876594543457, "objective/kl": 30.23657989501953, "objective/non_score_reward": -1.5118290185928345, "objective/rlhf_reward": -4.099904905037816, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 21.57486915588379, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.767578125, "step": 388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002869129180908 }, { "episode": 6240, "epoch": 0.03738720926052414, "loss/policy_avg": 0.33562996983528137, "lr": 9.751405930470348e-06, "objective/entropy": -154.47891235351562, "objective/kl": 18.6168155670166, "objective/non_score_reward": -0.9308407306671143, "objective/rlhf_reward": -3.723362982273102, "objective/scores": 0.0, "policy/approxkl_avg": 13.14146614074707, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65625, "step": 389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002055168151855 }, { "episode": 6256, "epoch": 0.03748307389965369, "loss/policy_avg": 0.037651438266038895, "lr": 9.750766871165645e-06, "objective/entropy": -6.050981521606445, "objective/kl": 26.29869270324707, "objective/non_score_reward": -1.3149347305297852, "objective/rlhf_reward": -5.25973904132843, "objective/scores": 0.0, "policy/approxkl_avg": 27.001697540283203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.759765625, "step": 390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982492923736572 }, { "episode": 6272, "epoch": 0.03757893853878324, "loss/policy_avg": 0.1277342140674591, "lr": 9.750127811860941e-06, "objective/entropy": -114.59310913085938, "objective/kl": 33.31782531738281, "objective/non_score_reward": -1.6658912897109985, "objective/rlhf_reward": -3.739846025348875, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 30.69461441040039, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.755859375, "step": 391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998853206634521 }, { "episode": 6288, "epoch": 0.03767480317791279, "loss/policy_avg": 0.08161749690771103, "lr": 9.749488752556238e-06, "objective/entropy": 28.02770233154297, "objective/kl": 25.580188751220703, "objective/non_score_reward": -1.279009461402893, "objective/rlhf_reward": -3.6654397054627985, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.5637845993042, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3642578125, "step": 392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969701766967773 }, { "episode": 6304, "epoch": 0.03777066781704234, "loss/policy_avg": 0.013617899268865585, "lr": 9.748849693251534e-06, "objective/entropy": 137.66958618164062, "objective/kl": 36.88829040527344, "objective/non_score_reward": -1.8444143533706665, "objective/rlhf_reward": -5.999055602637631, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.8839926719665527, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.703125, "step": 393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998722076416016 }, { "episode": 6320, "epoch": 0.037866532456171886, "loss/policy_avg": 0.7664667963981628, "lr": 9.74821063394683e-06, "objective/entropy": 12.1875, "objective/kl": 27.703767776489258, "objective/non_score_reward": -1.385188341140747, "objective/rlhf_reward": -4.181503379081173, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 29.00311279296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983487129211426 }, { "episode": 6336, "epoch": 0.037962397095301435, "loss/policy_avg": 0.13891640305519104, "lr": 9.747571574642127e-06, "objective/entropy": -52.291236877441406, "objective/kl": 29.62856101989746, "objective/non_score_reward": -1.4814281463623047, "objective/rlhf_reward": -4.10088383701713, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 45.48643112182617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.763671875, "step": 395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9967341423034668 }, { "episode": 6352, "epoch": 0.038058261734430984, "loss/policy_avg": -0.5259265899658203, "lr": 9.746932515337424e-06, "objective/entropy": -14.848602294921875, "objective/kl": 36.51825714111328, "objective/non_score_reward": -1.8259128332138062, "objective/rlhf_reward": -5.180944981352363, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 133.36766052246094, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.673828125, "step": 396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.008728504180908 }, { "episode": 6368, "epoch": 0.03815412637356053, "loss/policy_avg": 0.1340530812740326, "lr": 9.746293456032721e-06, "objective/entropy": -13.48861312866211, "objective/kl": 24.147233963012695, "objective/non_score_reward": -1.2073616981506348, "objective/rlhf_reward": -3.0961134592692057, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.865433216094971, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.787109375, "step": 397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0002119541168213 }, { "episode": 6384, "epoch": 0.03824999101269008, "loss/policy_avg": 0.036313191056251526, "lr": 9.745654396728016e-06, "objective/entropy": -118.45596313476562, "objective/kl": 26.90463638305664, "objective/non_score_reward": -1.3452317714691162, "objective/rlhf_reward": -3.5560982182350864, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.153594017028809, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009984970092773 }, { "episode": 6400, "epoch": 0.03834585565181963, "loss/policy_avg": 0.07543957978487015, "lr": 9.745015337423313e-06, "objective/entropy": 5.307586669921875, "objective/kl": 29.030933380126953, "objective/non_score_reward": -1.4515466690063477, "objective/rlhf_reward": -2.88246778094885, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 14.018705368041992, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.611328125, "step": 399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984686374664307 }, { "episode": 6416, "epoch": 0.03844172029094918, "loss/policy_avg": 0.11864852905273438, "lr": 9.74437627811861e-06, "objective/entropy": 10.484695434570312, "objective/kl": 24.462554931640625, "objective/non_score_reward": -1.2231277227401733, "objective/rlhf_reward": -3.376739227565464, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 14.423017501831055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.685546875, "step": 400, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988946914672852 }, { "episode": 6432, "epoch": 0.03853758493007873, "loss/policy_avg": -0.036792345345020294, "lr": 9.743737218813907e-06, "objective/entropy": -181.87400817871094, "objective/kl": 23.07555389404297, "objective/non_score_reward": -1.153777837753296, "objective/rlhf_reward": -3.191279132564632, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 20.132736206054688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.708984375, "step": 401, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00081729888916 }, { "episode": 6448, "epoch": 0.03863344956920828, "loss/policy_avg": 0.22927281260490417, "lr": 9.743098159509204e-06, "objective/entropy": -88.96450805664062, "objective/kl": 32.569129943847656, "objective/non_score_reward": -1.6284565925598145, "objective/rlhf_reward": -4.780492917696634, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 49.499900817871094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 402, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982388019561768 }, { "episode": 6464, "epoch": 0.03872931420833783, "loss/policy_avg": 0.30984753370285034, "lr": 9.7424591002045e-06, "objective/entropy": -18.365474700927734, "objective/kl": 31.77776336669922, "objective/non_score_reward": -1.5888882875442505, "objective/rlhf_reward": -5.030040267735643, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 36.973690032958984, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.607421875, "step": 403, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9970709085464478 }, { "episode": 6480, "epoch": 0.038825178847467376, "loss/policy_avg": 0.06557717174291611, "lr": 9.741820040899796e-06, "objective/entropy": -141.13568115234375, "objective/kl": 28.107177734375, "objective/non_score_reward": -1.405358910560608, "objective/rlhf_reward": -3.674024294094975, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 31.192813873291016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.791015625, "step": 404, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9952688217163086 }, { "episode": 6496, "epoch": 0.038921043486596925, "loss/policy_avg": 0.05502002686262131, "lr": 9.741180981595093e-06, "objective/entropy": 32.80726623535156, "objective/kl": 44.297119140625, "objective/non_score_reward": -2.2148561477661133, "objective/rlhf_reward": -7.5001741287454795, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.57358169555664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.650390625, "step": 405, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999144077301025 }, { "episode": 6512, "epoch": 0.039016908125726474, "loss/policy_avg": 0.026680059731006622, "lr": 9.74054192229039e-06, "objective/entropy": 119.29817962646484, "objective/kl": 39.39287567138672, "objective/non_score_reward": -1.9696437120437622, "objective/rlhf_reward": -6.536938837080627, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 0.6370775699615479, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6171875, "step": 406, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0018253326416016 }, { "episode": 6528, "epoch": 0.03911277276485602, "loss/policy_avg": 0.6271831393241882, "lr": 9.739902862985686e-06, "objective/entropy": 6.752727508544922, "objective/kl": 25.43050765991211, "objective/non_score_reward": -1.2715253829956055, "objective/rlhf_reward": -5.086101770401001, "objective/scores": 0.0, "policy/approxkl_avg": 17.81015396118164, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.74609375, "step": 407, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977548122406006 }, { "episode": 6544, "epoch": 0.03920863740398557, "loss/policy_avg": 0.30096232891082764, "lr": 9.739263803680983e-06, "objective/entropy": -24.516462326049805, "objective/kl": 38.53913116455078, "objective/non_score_reward": -1.9269566535949707, "objective/rlhf_reward": -5.585120143667732, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 15.016406059265137, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.603515625, "step": 408, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.994694709777832 }, { "episode": 6560, "epoch": 0.03930450204311512, "loss/policy_avg": 0.03762083500623703, "lr": 9.73862474437628e-06, "objective/entropy": -218.5489501953125, "objective/kl": 26.699615478515625, "objective/non_score_reward": -1.3349807262420654, "objective/rlhf_reward": -3.6780635170346363, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 59.4561653137207, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.54296875, "step": 409, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982738494873047 }, { "episode": 6576, "epoch": 0.03940036668224467, "loss/policy_avg": 0.2932765483856201, "lr": 9.737985685071575e-06, "objective/entropy": -25.477672576904297, "objective/kl": 35.529788970947266, "objective/non_score_reward": -1.776489496231079, "objective/rlhf_reward": -5.372624413172403, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 39.98287582397461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55859375, "step": 410, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999699354171753 }, { "episode": 6592, "epoch": 0.03949623132137422, "loss/policy_avg": -0.2486688196659088, "lr": 9.737346625766872e-06, "objective/entropy": -12.952373504638672, "objective/kl": 33.62919616699219, "objective/non_score_reward": -1.681459903717041, "objective/rlhf_reward": -4.778428207116063, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 28.626731872558594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.56640625, "step": 411, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003122568130493 }, { "episode": 6608, "epoch": 0.03959209596050377, "loss/policy_avg": 0.3249208629131317, "lr": 9.736707566462167e-06, "objective/entropy": -52.927459716796875, "objective/kl": 33.82263946533203, "objective/non_score_reward": -1.6911320686340332, "objective/rlhf_reward": -4.364528393745422, "objective/scores": 0.6, "policy/approxkl_avg": 41.674591064453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.791015625, "step": 412, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000582695007324 }, { "episode": 6624, "epoch": 0.039687960599633317, "loss/policy_avg": 0.15019002556800842, "lr": 9.736068507157464e-06, "objective/entropy": -22.71458625793457, "objective/kl": 32.99541473388672, "objective/non_score_reward": -1.6497704982757568, "objective/rlhf_reward": -5.257446458845763, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 6.256417274475098, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 413, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978971481323242 }, { "episode": 6640, "epoch": 0.039783825238762865, "loss/policy_avg": 0.296099990606308, "lr": 9.735429447852761e-06, "objective/entropy": -10.485054016113281, "objective/kl": 28.53786277770996, "objective/non_score_reward": -1.4268931150436401, "objective/rlhf_reward": -3.9742393652598063, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.458545684814453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.703125, "step": 414, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996311664581299 }, { "episode": 6656, "epoch": 0.039879689877892414, "loss/policy_avg": 0.3615366816520691, "lr": 9.734790388548058e-06, "objective/entropy": -102.9046859741211, "objective/kl": 19.901390075683594, "objective/non_score_reward": -0.9950695037841797, "objective/rlhf_reward": -2.3184185675984486, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 18.427024841308594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.82421875, "step": 415, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999051809310913 }, { "episode": 6672, "epoch": 0.03997555451702196, "loss/policy_avg": 0.14772659540176392, "lr": 9.734151329243355e-06, "objective/entropy": -148.49395751953125, "objective/kl": 26.190744400024414, "objective/non_score_reward": -1.3095372915267944, "objective/rlhf_reward": -3.6340291834512524, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 59.936073303222656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.806640625, "step": 416, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001544237136841 }, { "episode": 6688, "epoch": 0.04007141915615151, "loss/policy_avg": 0.23557257652282715, "lr": 9.73351226993865e-06, "objective/entropy": -145.32284545898438, "objective/kl": 30.992046356201172, "objective/non_score_reward": -1.5496025085449219, "objective/rlhf_reward": -3.7984096765518185, "objective/scores": 0.6, "policy/approxkl_avg": 7.065143585205078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.904296875, "step": 417, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989118576049805 }, { "episode": 6704, "epoch": 0.04016728379528106, "loss/policy_avg": 0.12179827690124512, "lr": 9.732873210633947e-06, "objective/entropy": -64.65836334228516, "objective/kl": 35.22796630859375, "objective/non_score_reward": -1.7613983154296875, "objective/rlhf_reward": -5.686343335841579, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 61.170570373535156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5546875, "step": 418, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985511302947998 }, { "episode": 6720, "epoch": 0.04026314843441061, "loss/policy_avg": -0.043803490698337555, "lr": 9.732234151329244e-06, "objective/entropy": -87.70707702636719, "objective/kl": 28.95832061767578, "objective/non_score_reward": -1.447916030883789, "objective/rlhf_reward": -4.275892340930637, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.8885676860809326, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.763671875, "step": 419, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9999752044677734 }, { "episode": 6736, "epoch": 0.04035901307354016, "loss/policy_avg": 0.18042519688606262, "lr": 9.73159509202454e-06, "objective/entropy": -4.936176300048828, "objective/kl": 30.613567352294922, "objective/non_score_reward": -1.5306785106658936, "objective/rlhf_reward": -4.722713804244995, "objective/scores": 0.35, "policy/approxkl_avg": 209.10888671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619140625, "step": 420, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993661642074585 }, { "episode": 6752, "epoch": 0.04045487771266971, "loss/policy_avg": 0.6567588448524475, "lr": 9.730956032719838e-06, "objective/entropy": -162.10116577148438, "objective/kl": 33.140079498291016, "objective/non_score_reward": -1.6570039987564087, "objective/rlhf_reward": -4.505309881941352, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 33.703067779541016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7578125, "step": 421, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9999632835388184 }, { "episode": 6768, "epoch": 0.04055074235179926, "loss/policy_avg": 0.5961964130401611, "lr": 9.730316973415135e-06, "objective/entropy": 18.374740600585938, "objective/kl": 36.82442092895508, "objective/non_score_reward": -1.8412209749221802, "objective/rlhf_reward": -4.441164646984312, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 62.1960334777832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.654296875, "step": 422, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999306201934814 }, { "episode": 6784, "epoch": 0.040646606990928806, "loss/policy_avg": 0.19755011796951294, "lr": 9.72967791411043e-06, "objective/entropy": -57.290000915527344, "objective/kl": 30.764808654785156, "objective/non_score_reward": -1.5382404327392578, "objective/rlhf_reward": -4.811326077490478, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 37.60175323486328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.67578125, "step": 423, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990873336791992 }, { "episode": 6800, "epoch": 0.040742471630058355, "loss/policy_avg": 0.2760317325592041, "lr": 9.729038854805727e-06, "objective/entropy": -54.2406005859375, "objective/kl": 28.681961059570312, "objective/non_score_reward": -1.4340981245040894, "objective/rlhf_reward": -3.7889812094735458, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 13.514376640319824, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.548828125, "step": 424, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004029273986816 }, { "episode": 6816, "epoch": 0.040838336269187904, "loss/policy_avg": 0.05885821580886841, "lr": 9.728399795501023e-06, "objective/entropy": -30.280364990234375, "objective/kl": 31.102825164794922, "objective/non_score_reward": -1.5551413297653198, "objective/rlhf_reward": -4.820565319061279, "objective/scores": 0.35, "policy/approxkl_avg": 61.290470123291016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.767578125, "step": 425, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986295700073242 }, { "episode": 6832, "epoch": 0.04093420090831745, "loss/policy_avg": 0.044344570487737656, "lr": 9.72776073619632e-06, "objective/entropy": -223.16510009765625, "objective/kl": 11.546382904052734, "objective/non_score_reward": -0.5773191452026367, "objective/rlhf_reward": -0.3618654114770252, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.5684561729431152, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7109375, "step": 426, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0002474784851074 }, { "episode": 6848, "epoch": 0.041030065547447, "loss/policy_avg": 0.11938305199146271, "lr": 9.727121676891617e-06, "objective/entropy": -84.6756362915039, "objective/kl": 32.253173828125, "objective/non_score_reward": -1.6126585006713867, "objective/rlhf_reward": -5.000035624118194, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 54.86524963378906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.693359375, "step": 427, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985135793685913 }, { "episode": 6864, "epoch": 0.04112593018657655, "loss/policy_avg": -0.02704887092113495, "lr": 9.726482617586912e-06, "objective/entropy": 61.31664276123047, "objective/kl": 50.535186767578125, "objective/non_score_reward": -2.526759624481201, "objective/rlhf_reward": -8.765402606039672, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 87.70621490478516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4794921875, "step": 428, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0009117126464844 }, { "episode": 6880, "epoch": 0.0412217948257061, "loss/policy_avg": 0.3563253581523895, "lr": 9.72584355828221e-06, "objective/entropy": -201.59555053710938, "objective/kl": 26.542133331298828, "objective/non_score_reward": -1.3271067142486572, "objective/rlhf_reward": -2.384707783104154, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.606565475463867, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.60546875, "step": 429, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991586208343506 }, { "episode": 6896, "epoch": 0.04131765946483565, "loss/policy_avg": 0.3849369287490845, "lr": 9.725204498977506e-06, "objective/entropy": -172.11151123046875, "objective/kl": 31.27842140197754, "objective/non_score_reward": -1.5639209747314453, "objective/rlhf_reward": -4.52235098282496, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 35.41864776611328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.794921875, "step": 430, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9977457523345947 }, { "episode": 6912, "epoch": 0.0414135241039652, "loss/policy_avg": 0.5410929918289185, "lr": 9.724565439672803e-06, "objective/entropy": -53.43696594238281, "objective/kl": 36.75939178466797, "objective/non_score_reward": -1.8379695415496826, "objective/rlhf_reward": -5.229172053114448, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.017414093017578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.51953125, "step": 431, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9961919784545898 }, { "episode": 6928, "epoch": 0.041509388743094754, "loss/policy_avg": 0.5185568332672119, "lr": 9.7239263803681e-06, "objective/entropy": -42.49586486816406, "objective/kl": 31.465147018432617, "objective/non_score_reward": -1.5732574462890625, "objective/rlhf_reward": -4.914427437869412, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.669852614402771, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62109375, "step": 432, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998645782470703 }, { "episode": 6944, "epoch": 0.0416052533822243, "loss/policy_avg": -0.09886922687292099, "lr": 9.723287321063397e-06, "objective/entropy": -182.28286743164062, "objective/kl": 27.1431884765625, "objective/non_score_reward": -1.3571594953536987, "objective/rlhf_reward": -3.6953046480814615, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 25.096237182617188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 433, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0018882751464844 }, { "episode": 6960, "epoch": 0.04170111802135385, "loss/policy_avg": 0.39349502325057983, "lr": 9.722648261758692e-06, "objective/entropy": 28.20358657836914, "objective/kl": 38.92597198486328, "objective/non_score_reward": -1.946298599243164, "objective/rlhf_reward": -6.385194158554077, "objective/scores": 0.35, "policy/approxkl_avg": 46.153385162353516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4912109375, "step": 434, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992406368255615 }, { "episode": 6976, "epoch": 0.0417969826604834, "loss/policy_avg": 0.3586619198322296, "lr": 9.722009202453989e-06, "objective/entropy": -126.02680206298828, "objective/kl": 32.40974807739258, "objective/non_score_reward": -1.6204874515533447, "objective/rlhf_reward": -4.534538338856633, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 10.944326400756836, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.666015625, "step": 435, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971150159835815 }, { "episode": 6992, "epoch": 0.04189284729961295, "loss/policy_avg": -0.4687817692756653, "lr": 9.721370143149284e-06, "objective/entropy": -69.42359924316406, "objective/kl": 20.10685157775879, "objective/non_score_reward": -1.0053426027297974, "objective/rlhf_reward": -2.6427683430291236, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 22.483867645263672, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6328125, "step": 436, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.041172504425049 }, { "episode": 7008, "epoch": 0.0419887119387425, "loss/policy_avg": 0.0906272605061531, "lr": 9.720731083844581e-06, "objective/entropy": -149.47274780273438, "objective/kl": 26.28115463256836, "objective/non_score_reward": -1.3140578269958496, "objective/rlhf_reward": -3.1335249564805365, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.7223973274230957, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.599609375, "step": 437, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000173568725586 }, { "episode": 7024, "epoch": 0.04208457657787205, "loss/policy_avg": 0.3348531126976013, "lr": 9.720092024539878e-06, "objective/entropy": 22.56686782836914, "objective/kl": 36.523582458496094, "objective/non_score_reward": -1.8261791467666626, "objective/rlhf_reward": -5.700596723620015, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 20.443164825439453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.515625, "step": 438, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979515075683594 }, { "episode": 7040, "epoch": 0.042180441217001596, "loss/policy_avg": 0.04725319519639015, "lr": 9.719452965235175e-06, "objective/entropy": -71.08361053466797, "objective/kl": 20.915573120117188, "objective/non_score_reward": -1.045778751373291, "objective/rlhf_reward": -2.0604087731995917, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.088305473327637, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4462890625, "step": 439, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0024590492248535 }, { "episode": 7056, "epoch": 0.042276305856131145, "loss/policy_avg": 0.18381188809871674, "lr": 9.718813905930472e-06, "objective/entropy": 25.569873809814453, "objective/kl": 38.07762145996094, "objective/non_score_reward": -1.9038809537887573, "objective/rlhf_reward": -3.215523815155029, "objective/scores": 1.1, "policy/approxkl_avg": 30.962854385375977, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62890625, "step": 440, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0011448860168457 }, { "episode": 7072, "epoch": 0.042372170495260694, "loss/policy_avg": 0.1967303454875946, "lr": 9.718174846625767e-06, "objective/entropy": -103.38803100585938, "objective/kl": 29.222076416015625, "objective/non_score_reward": -1.4611037969589233, "objective/rlhf_reward": -4.240295205179768, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.899417877197266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689453125, "step": 441, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986486434936523 }, { "episode": 7088, "epoch": 0.04246803513439024, "loss/policy_avg": -0.07635466754436493, "lr": 9.717535787321064e-06, "objective/entropy": -54.58887481689453, "objective/kl": 35.043663024902344, "objective/non_score_reward": -1.752183198928833, "objective/rlhf_reward": -5.527779820378184, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 12.18149185180664, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.580078125, "step": 442, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0013041496276855 }, { "episode": 7104, "epoch": 0.04256389977351979, "loss/policy_avg": 0.3104819059371948, "lr": 9.71689672801636e-06, "objective/entropy": -53.842830657958984, "objective/kl": 23.18008804321289, "objective/non_score_reward": -1.1590044498443604, "objective/rlhf_reward": -3.0797587921291143, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 86.82899475097656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.91796875, "step": 443, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991846084594727 }, { "episode": 7120, "epoch": 0.04265976441264934, "loss/policy_avg": 0.6317604780197144, "lr": 9.716257668711657e-06, "objective/entropy": -21.19356918334961, "objective/kl": 30.069751739501953, "objective/non_score_reward": -1.503487467765808, "objective/rlhf_reward": -4.6353477025903285, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 128.40951538085938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.76953125, "step": 444, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997659683227539 }, { "episode": 7136, "epoch": 0.04275562905177889, "loss/policy_avg": 0.33194229006767273, "lr": 9.715618609406954e-06, "objective/entropy": -102.48907470703125, "objective/kl": 32.374549865722656, "objective/non_score_reward": -1.6187275648117065, "objective/rlhf_reward": -6.474910318851471, "objective/scores": 0.0, "policy/approxkl_avg": 7.681756973266602, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59375, "step": 445, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998705506324768 }, { "episode": 7152, "epoch": 0.04285149369090844, "loss/policy_avg": 0.26850253343582153, "lr": 9.714979550102251e-06, "objective/entropy": 69.35136413574219, "objective/kl": 26.097612380981445, "objective/non_score_reward": -1.3048806190490723, "objective/rlhf_reward": -3.738569977696299, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 62.56462097167969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6640625, "step": 446, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99786376953125 }, { "episode": 7168, "epoch": 0.04294735833003799, "loss/policy_avg": -0.1885017603635788, "lr": 9.714340490797546e-06, "objective/entropy": -16.98421859741211, "objective/kl": 30.90627670288086, "objective/non_score_reward": -1.5453139543533325, "objective/rlhf_reward": -4.577135715548115, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 11.766645431518555, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.73828125, "step": 447, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003262996673584 }, { "episode": 7184, "epoch": 0.04304322296916754, "loss/policy_avg": 0.24147900938987732, "lr": 9.713701431492843e-06, "objective/entropy": -196.87869262695312, "objective/kl": 23.231670379638672, "objective/non_score_reward": -1.161583662033081, "objective/rlhf_reward": -3.1305624780976142, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 19.03369903564453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 448, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996755123138428 }, { "episode": 7200, "epoch": 0.043139087608297086, "loss/policy_avg": 0.3051467537879944, "lr": 9.71306237218814e-06, "objective/entropy": -54.2137565612793, "objective/kl": 33.54918670654297, "objective/non_score_reward": -1.6774592399597168, "objective/rlhf_reward": -5.047977810323822, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 74.37176513671875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.783203125, "step": 449, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9967325925827026 }, { "episode": 7216, "epoch": 0.043234952247426635, "loss/policy_avg": 0.0008301436901092529, "lr": 9.712423312883437e-06, "objective/entropy": -37.864322662353516, "objective/kl": 24.052818298339844, "objective/non_score_reward": -1.2026410102844238, "objective/rlhf_reward": -2.9857349946823826, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.6498993635177612, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.525390625, "step": 450, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001569986343384 }, { "episode": 7232, "epoch": 0.043330816886556184, "loss/policy_avg": 0.10217726975679398, "lr": 9.711784253578734e-06, "objective/entropy": -97.12496948242188, "objective/kl": 20.143707275390625, "objective/non_score_reward": -1.007185459136963, "objective/rlhf_reward": -2.669491672252102, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 37.34214401245117, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.783203125, "step": 451, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993760585784912 }, { "episode": 7248, "epoch": 0.04342668152568573, "loss/policy_avg": 0.2181258350610733, "lr": 9.711145194274029e-06, "objective/entropy": -187.07266235351562, "objective/kl": 22.520824432373047, "objective/non_score_reward": -1.1260414123535156, "objective/rlhf_reward": -2.9000454283395585, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 80.40426635742188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 452, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000328540802002 }, { "episode": 7264, "epoch": 0.04352254616481528, "loss/policy_avg": 0.28700706362724304, "lr": 9.710506134969326e-06, "objective/entropy": -119.91871643066406, "objective/kl": 30.88311004638672, "objective/non_score_reward": -1.5441553592681885, "objective/rlhf_reward": -4.834986022024779, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 14.897968292236328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.787109375, "step": 453, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9972997903823853 }, { "episode": 7280, "epoch": 0.04361841080394483, "loss/policy_avg": 0.013649387285113335, "lr": 9.709867075664623e-06, "objective/entropy": -137.84861755371094, "objective/kl": 35.624549865722656, "objective/non_score_reward": -1.781227707862854, "objective/rlhf_reward": -7.124910950660706, "objective/scores": 0.0, "policy/approxkl_avg": 77.14759826660156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.712890625, "step": 454, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999969720840454 }, { "episode": 7296, "epoch": 0.04371427544307438, "loss/policy_avg": 0.9055305123329163, "lr": 9.70922801635992e-06, "objective/entropy": -177.1896514892578, "objective/kl": 34.19129943847656, "objective/non_score_reward": -1.7095649242401123, "objective/rlhf_reward": -5.387661199183807, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 51.92662811279297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66796875, "step": 455, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9976481199264526 }, { "episode": 7312, "epoch": 0.04381014008220393, "loss/policy_avg": -0.14486947655677795, "lr": 9.708588957055215e-06, "objective/entropy": -91.43609619140625, "objective/kl": 30.12580108642578, "objective/non_score_reward": -1.5062901973724365, "objective/rlhf_reward": -4.509388887675938, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 24.85628890991211, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.611328125, "step": 456, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.011446952819824 }, { "episode": 7328, "epoch": 0.04390600472133348, "loss/policy_avg": 0.3115137815475464, "lr": 9.707949897750512e-06, "objective/entropy": -33.496673583984375, "objective/kl": 24.4674072265625, "objective/non_score_reward": -1.2233703136444092, "objective/rlhf_reward": -3.377709650787052, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 9.057685852050781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.751953125, "step": 457, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009520053863525 }, { "episode": 7344, "epoch": 0.044001869360463026, "loss/policy_avg": 1.4892723560333252, "lr": 9.707310838445809e-06, "objective/entropy": -35.618934631347656, "objective/kl": 27.64456558227539, "objective/non_score_reward": -1.3822282552719116, "objective/rlhf_reward": -3.5815017921494796, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.899414300918579, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.57421875, "step": 458, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999825954437256 }, { "episode": 7360, "epoch": 0.044097733999592575, "loss/policy_avg": 0.022264737635850906, "lr": 9.706671779141105e-06, "objective/entropy": 31.060089111328125, "objective/kl": 34.85979461669922, "objective/non_score_reward": -1.7429897785186768, "objective/rlhf_reward": -5.367839369837361, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 7.1077799797058105, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.751953125, "step": 459, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993565082550049 }, { "episode": 7376, "epoch": 0.044193598638722124, "loss/policy_avg": 0.08219340443611145, "lr": 9.7060327198364e-06, "objective/entropy": -69.6414566040039, "objective/kl": 35.42669677734375, "objective/non_score_reward": -1.7713346481323242, "objective/rlhf_reward": -5.726088785861416, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 21.27887535095215, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.712890625, "step": 460, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0088043212890625 }, { "episode": 7392, "epoch": 0.04428946327785167, "loss/policy_avg": 0.03685396909713745, "lr": 9.705393660531698e-06, "objective/entropy": -245.04380798339844, "objective/kl": 21.42380142211914, "objective/non_score_reward": -1.0711899995803833, "objective/rlhf_reward": -2.1620538852372504, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.849046230316162, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 461, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.008730411529541 }, { "episode": 7408, "epoch": 0.04438532791698122, "loss/policy_avg": 0.5492111444473267, "lr": 9.704754601226994e-06, "objective/entropy": 9.25466537475586, "objective/kl": 20.997167587280273, "objective/non_score_reward": -1.0498583316802979, "objective/rlhf_reward": -1.2757146700632302, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 36.03380584716797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9296875, "step": 462, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000826120376587 }, { "episode": 7424, "epoch": 0.04448119255611077, "loss/policy_avg": 0.22961178421974182, "lr": 9.704115541922291e-06, "objective/entropy": -2.9236984252929688, "objective/kl": 26.89717674255371, "objective/non_score_reward": -1.3448588848114014, "objective/rlhf_reward": -3.717576061905013, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 133.2696075439453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8984375, "step": 463, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999076008796692 }, { "episode": 7440, "epoch": 0.04457705719524032, "loss/policy_avg": 0.1330358386039734, "lr": 9.703476482617588e-06, "objective/entropy": -155.3049774169922, "objective/kl": 32.32700729370117, "objective/non_score_reward": -1.6163502931594849, "objective/rlhf_reward": -2.0654012918472286, "objective/scores": 1.1, "policy/approxkl_avg": 352.436767578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.86328125, "step": 464, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9973843097686768 }, { "episode": 7456, "epoch": 0.04467292183436987, "loss/policy_avg": 0.13191767036914825, "lr": 9.702837423312883e-06, "objective/entropy": -130.06350708007812, "objective/kl": 31.98480987548828, "objective/non_score_reward": -1.5992405414581299, "objective/rlhf_reward": -5.07144889596097, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.149503707885742, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 465, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979965686798096 }, { "episode": 7472, "epoch": 0.04476878647349942, "loss/policy_avg": 0.11230316013097763, "lr": 9.70219836400818e-06, "objective/entropy": 11.579151153564453, "objective/kl": 34.1675910949707, "objective/non_score_reward": -1.7083796262741089, "objective/rlhf_reward": -5.3525657681778664, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 13.865779876708984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 466, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00162410736084 }, { "episode": 7488, "epoch": 0.04486465111262897, "loss/policy_avg": 0.2810555398464203, "lr": 9.701559304703477e-06, "objective/entropy": -138.13914489746094, "objective/kl": 22.91815948486328, "objective/non_score_reward": -1.145907998085022, "objective/rlhf_reward": -3.205029585448605, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 97.98136901855469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.775390625, "step": 467, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984302520751953 }, { "episode": 7504, "epoch": 0.044960515751758516, "loss/policy_avg": -0.09679757058620453, "lr": 9.700920245398774e-06, "objective/entropy": -44.23152160644531, "objective/kl": 34.52162170410156, "objective/non_score_reward": -1.726081132888794, "objective/rlhf_reward": -5.170991019407907, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 12.573694229125977, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.3896484375, "step": 468, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995930194854736 }, { "episode": 7520, "epoch": 0.045056380390888065, "loss/policy_avg": 0.2740531265735626, "lr": 9.700281186094071e-06, "objective/entropy": -64.87997436523438, "objective/kl": 30.31191062927246, "objective/non_score_reward": -1.5155954360961914, "objective/rlhf_reward": -4.329048738876978, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 12.677139282226562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.630859375, "step": 469, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981954097747803 }, { "episode": 7536, "epoch": 0.045152245030017614, "loss/policy_avg": 0.4849107265472412, "lr": 9.699642126789368e-06, "objective/entropy": -136.48355102539062, "objective/kl": 20.618619918823242, "objective/non_score_reward": -1.030930995941162, "objective/rlhf_reward": -2.6998918845253863, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 95.56924438476562, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.607421875, "step": 470, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9975948333740234 }, { "episode": 7552, "epoch": 0.04524810966914716, "loss/policy_avg": 0.05032477527856827, "lr": 9.699003067484663e-06, "objective/entropy": -116.99330139160156, "objective/kl": 31.927814483642578, "objective/non_score_reward": -1.596390724182129, "objective/rlhf_reward": -5.026312672828121, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.1943883895874023, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.744140625, "step": 471, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004005432128906 }, { "episode": 7568, "epoch": 0.04534397430827671, "loss/policy_avg": 0.23768550157546997, "lr": 9.69836400817996e-06, "objective/entropy": -56.441200256347656, "objective/kl": 35.956565856933594, "objective/non_score_reward": -1.7978280782699585, "objective/rlhf_reward": -5.587192330423909, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 18.25104522705078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.75390625, "step": 472, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001668930053711 }, { "episode": 7584, "epoch": 0.04543983894740626, "loss/policy_avg": 0.18428431451320648, "lr": 9.697724948875257e-06, "objective/entropy": -12.911811828613281, "objective/kl": 31.440038681030273, "objective/non_score_reward": -1.5720020532608032, "objective/rlhf_reward": -4.554674939314523, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 33.68145751953125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.673828125, "step": 473, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997875690460205 }, { "episode": 7600, "epoch": 0.04553570358653581, "loss/policy_avg": 1.0267724990844727, "lr": 9.697085889570554e-06, "objective/entropy": -155.81759643554688, "objective/kl": 15.551814079284668, "objective/non_score_reward": -0.7775906920433044, "objective/rlhf_reward": -1.7317606593049586, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.7084851264953613, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.673828125, "step": 474, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998857975006104 }, { "episode": 7616, "epoch": 0.04563156822566536, "loss/policy_avg": 0.5301028490066528, "lr": 9.69644683026585e-06, "objective/entropy": -186.65789794921875, "objective/kl": 37.16144561767578, "objective/non_score_reward": -1.858072280883789, "objective/rlhf_reward": -5.876029699054316, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 48.150047302246094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 475, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9972370862960815 }, { "episode": 7632, "epoch": 0.04572743286479491, "loss/policy_avg": 0.2144310474395752, "lr": 9.695807770961146e-06, "objective/entropy": -153.16233825683594, "objective/kl": 31.742645263671875, "objective/non_score_reward": -1.5871323347091675, "objective/rlhf_reward": -4.832757556232151, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 43.260581970214844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 476, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996619701385498 }, { "episode": 7648, "epoch": 0.04582329750392446, "loss/policy_avg": 0.1423683762550354, "lr": 9.695168711656443e-06, "objective/entropy": -101.34695434570312, "objective/kl": 34.40277099609375, "objective/non_score_reward": -1.7201385498046875, "objective/rlhf_reward": -5.555040988951845, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.133903503417969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.740234375, "step": 477, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991512298583984 }, { "episode": 7664, "epoch": 0.045919162143054006, "loss/policy_avg": -0.20567180216312408, "lr": 9.694529652351738e-06, "objective/entropy": 1.8477153778076172, "objective/kl": 34.25542068481445, "objective/non_score_reward": -1.7127711772918701, "objective/rlhf_reward": -5.451084411144256, "objective/scores": 0.35, "policy/approxkl_avg": 90.96925354003906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.79296875, "step": 478, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978113174438477 }, { "episode": 7680, "epoch": 0.046015026782183555, "loss/policy_avg": 0.04285082221031189, "lr": 9.693890593047035e-06, "objective/entropy": -163.51800537109375, "objective/kl": 39.76237487792969, "objective/non_score_reward": -1.9881186485290527, "objective/rlhf_reward": -6.47152245324409, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 41.795677185058594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.64453125, "step": 479, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989352226257324 }, { "episode": 7696, "epoch": 0.046110891421313104, "loss/policy_avg": 0.30679094791412354, "lr": 9.693251533742331e-06, "objective/entropy": -137.21139526367188, "objective/kl": 24.817203521728516, "objective/non_score_reward": -1.2408602237701416, "objective/rlhf_reward": -3.407181172576502, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 7.010622024536133, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.740234375, "step": 480, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998010516166687 }, { "episode": 7712, "epoch": 0.04620675606044265, "loss/policy_avg": 0.14935311675071716, "lr": 9.692612474437628e-06, "objective/entropy": -133.61581420898438, "objective/kl": 28.18117904663086, "objective/non_score_reward": -1.4090590476989746, "objective/rlhf_reward": -4.276986324523373, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 41.72409439086914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.720703125, "step": 481, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9976277351379395 }, { "episode": 7728, "epoch": 0.0463026206995722, "loss/policy_avg": 0.4503282606601715, "lr": 9.691973415132925e-06, "objective/entropy": -185.92971801757812, "objective/kl": 24.44643783569336, "objective/non_score_reward": -1.22232186794281, "objective/rlhf_reward": -4.88928747177124, "objective/scores": 0.0, "policy/approxkl_avg": 26.91709327697754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55078125, "step": 482, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986282587051392 }, { "episode": 7744, "epoch": 0.04639848533870175, "loss/policy_avg": 0.7586182355880737, "lr": 9.691334355828222e-06, "objective/entropy": -136.83555603027344, "objective/kl": 27.66883087158203, "objective/non_score_reward": -1.38344144821167, "objective/rlhf_reward": -3.41105959035543, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 39.446250915527344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4287109375, "step": 483, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9947845935821533 }, { "episode": 7760, "epoch": 0.0464943499778313, "loss/policy_avg": 0.47291696071624756, "lr": 9.690695296523517e-06, "objective/entropy": 10.135929107666016, "objective/kl": 31.171567916870117, "objective/non_score_reward": -1.558578372001648, "objective/rlhf_reward": -4.572453921259033, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 15.718633651733398, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.611328125, "step": 484, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997343897819519 }, { "episode": 7776, "epoch": 0.04659021461696085, "loss/policy_avg": 0.19839856028556824, "lr": 9.690056237218814e-06, "objective/entropy": -64.7506332397461, "objective/kl": 25.45448112487793, "objective/non_score_reward": -1.2727241516113281, "objective/rlhf_reward": -2.690896427631378, "objective/scores": 0.6, "policy/approxkl_avg": 29.054779052734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.76953125, "step": 485, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977914094924927 }, { "episode": 7792, "epoch": 0.0466860792560904, "loss/policy_avg": 0.16692940890789032, "lr": 9.689417177914111e-06, "objective/entropy": -200.1573028564453, "objective/kl": 16.24359893798828, "objective/non_score_reward": -0.8121800422668457, "objective/rlhf_reward": -1.6446000672021683, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.7478506565093994, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.58984375, "step": 486, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997503757476807 }, { "episode": 7808, "epoch": 0.046781943895219946, "loss/policy_avg": 0.20832450687885284, "lr": 9.688778118609408e-06, "objective/entropy": -229.8734893798828, "objective/kl": 24.610809326171875, "objective/non_score_reward": -1.2305405139923096, "objective/rlhf_reward": -3.3180417156854443, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 50.22547912597656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 487, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9977538585662842 }, { "episode": 7824, "epoch": 0.046877808534349495, "loss/policy_avg": 0.584824800491333, "lr": 9.688139059304705e-06, "objective/entropy": -159.94088745117188, "objective/kl": 32.78782653808594, "objective/non_score_reward": -1.6393911838531494, "objective/rlhf_reward": -5.041793072017368, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 53.52165985107422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4560546875, "step": 488, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9974932670593262 }, { "episode": 7840, "epoch": 0.046973673173479044, "loss/policy_avg": 0.10657641291618347, "lr": 9.6875e-06, "objective/entropy": -117.46031188964844, "objective/kl": 22.680068969726562, "objective/non_score_reward": -1.1340034008026123, "objective/rlhf_reward": -2.802680269877116, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 31.437467575073242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 489, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984140396118164 }, { "episode": 7856, "epoch": 0.0470695378126086, "loss/policy_avg": 0.05225694179534912, "lr": 9.686860940695297e-06, "objective/entropy": -102.69722747802734, "objective/kl": 35.890769958496094, "objective/non_score_reward": -1.7945387363433838, "objective/rlhf_reward": -2.7781547069549557, "objective/scores": 1.1, "policy/approxkl_avg": 8.238727569580078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.693359375, "step": 490, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996579885482788 }, { "episode": 7872, "epoch": 0.04716540245173815, "loss/policy_avg": 0.3118276000022888, "lr": 9.686221881390594e-06, "objective/entropy": -42.73939895629883, "objective/kl": 22.486095428466797, "objective/non_score_reward": -1.1243047714233398, "objective/rlhf_reward": -3.0733869268494525, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 29.32803726196289, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.609375, "step": 491, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991399049758911 }, { "episode": 7888, "epoch": 0.0472612670908677, "loss/policy_avg": 0.621738076210022, "lr": 9.68558282208589e-06, "objective/entropy": -26.77874755859375, "objective/kl": 33.77405548095703, "objective/non_score_reward": -1.688702940940857, "objective/rlhf_reward": -5.198552160468653, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.273128509521484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73046875, "step": 492, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988332986831665 }, { "episode": 7904, "epoch": 0.04735713172999725, "loss/policy_avg": 0.16049662232398987, "lr": 9.684943762781188e-06, "objective/entropy": -84.04755401611328, "objective/kl": 25.384605407714844, "objective/non_score_reward": -1.2692303657531738, "objective/rlhf_reward": -2.1532023891222205, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.7223958373069763, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.646484375, "step": 493, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005576610565186 }, { "episode": 7920, "epoch": 0.047452996369126796, "loss/policy_avg": 0.3413264751434326, "lr": 9.684304703476484e-06, "objective/entropy": -118.85188293457031, "objective/kl": 30.77880859375, "objective/non_score_reward": -1.5389404296875, "objective/rlhf_reward": -4.422428623835246, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 19.30898666381836, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 494, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997934103012085 }, { "episode": 7936, "epoch": 0.047548861008256345, "loss/policy_avg": -0.016445789486169815, "lr": 9.68366564417178e-06, "objective/entropy": -211.39361572265625, "objective/kl": 26.587682723999023, "objective/non_score_reward": -1.3293840885162354, "objective/rlhf_reward": -2.917536354064941, "objective/scores": 0.6, "policy/approxkl_avg": 50.449562072753906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5234375, "step": 495, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99893319606781 }, { "episode": 7952, "epoch": 0.047644725647385894, "loss/policy_avg": -0.2565712034702301, "lr": 9.683026584867076e-06, "objective/entropy": -49.41560363769531, "objective/kl": 27.722068786621094, "objective/non_score_reward": -1.3861035108566284, "objective/rlhf_reward": -3.882554417074309, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 16.277629852294922, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.703125, "step": 496, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003061294555664 }, { "episode": 7968, "epoch": 0.04774059028651544, "loss/policy_avg": 0.17001637816429138, "lr": 9.682387525562373e-06, "objective/entropy": -40.254676818847656, "objective/kl": 25.527742385864258, "objective/non_score_reward": -1.2763869762420654, "objective/rlhf_reward": -5.10554826259613, "objective/scores": 0.0, "policy/approxkl_avg": 19.284744262695312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 497, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9972081184387207 }, { "episode": 7984, "epoch": 0.04783645492564499, "loss/policy_avg": 0.08028728514909744, "lr": 9.68174846625767e-06, "objective/entropy": -23.79485321044922, "objective/kl": 23.14282989501953, "objective/non_score_reward": -1.1571415662765503, "objective/rlhf_reward": -4.628566324710846, "objective/scores": 0.0, "policy/approxkl_avg": 25.781452178955078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4580078125, "step": 498, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980249404907227 }, { "episode": 8000, "epoch": 0.04793231956477454, "loss/policy_avg": 0.2174569070339203, "lr": 9.681109406952967e-06, "objective/entropy": -109.13389587402344, "objective/kl": 36.64985656738281, "objective/non_score_reward": -1.8324928283691406, "objective/rlhf_reward": -5.951369323817593, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 27.508981704711914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.525390625, "step": 499, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99745512008667 }, { "episode": 8016, "epoch": 0.04802818420390409, "loss/policy_avg": 0.13631635904312134, "lr": 9.680470347648262e-06, "objective/entropy": -99.519775390625, "objective/kl": 41.364810943603516, "objective/non_score_reward": -2.0682406425476074, "objective/rlhf_reward": -6.448134417804788, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 102.98858642578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4970703125, "step": 500, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998145341873169 }, { "episode": 8032, "epoch": 0.04812404884303364, "loss/policy_avg": 0.059351589530706406, "lr": 9.67983128834356e-06, "objective/entropy": -226.86756896972656, "objective/kl": 27.588150024414062, "objective/non_score_reward": -1.379407525062561, "objective/rlhf_reward": -4.001858436855015, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.536296844482422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.65625, "step": 501, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976544380187988 }, { "episode": 8048, "epoch": 0.04821991348216319, "loss/policy_avg": 0.5408469438552856, "lr": 9.679192229038854e-06, "objective/entropy": 4.518913269042969, "objective/kl": 37.552825927734375, "objective/non_score_reward": -1.8776414394378662, "objective/rlhf_reward": -5.777232364813486, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 8.410907745361328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.787109375, "step": 502, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991774559020996 }, { "episode": 8064, "epoch": 0.048315778121292736, "loss/policy_avg": 1.089150071144104, "lr": 9.678553169734151e-06, "objective/entropy": -70.22102355957031, "objective/kl": 36.886138916015625, "objective/non_score_reward": -1.8443071842193604, "objective/rlhf_reward": -5.254522027746711, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 17.696430206298828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.662109375, "step": 503, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9944283962249756 }, { "episode": 8080, "epoch": 0.048411642760422285, "loss/policy_avg": 0.04815336689352989, "lr": 9.677914110429448e-06, "objective/entropy": -206.61251831054688, "objective/kl": 19.784542083740234, "objective/non_score_reward": -0.9892270565032959, "objective/rlhf_reward": -2.4411365626179538, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 10.987642288208008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 504, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982428550720215 }, { "episode": 8096, "epoch": 0.048507507399551834, "loss/policy_avg": 0.4511667788028717, "lr": 9.677275051124745e-06, "objective/entropy": -44.11040496826172, "objective/kl": 32.054603576660156, "objective/non_score_reward": -1.6027300357818604, "objective/rlhf_reward": -4.8951483605229225, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 161.647705078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 505, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990203380584717 }, { "episode": 8112, "epoch": 0.04860337203868138, "loss/policy_avg": 0.43728113174438477, "lr": 9.676635991820042e-06, "objective/entropy": -167.46401977539062, "objective/kl": 25.358474731445312, "objective/non_score_reward": -1.2679238319396973, "objective/rlhf_reward": -5.071695148944855, "objective/scores": 0.0, "policy/approxkl_avg": 6.505180358886719, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.67578125, "step": 506, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999653697013855 }, { "episode": 8128, "epoch": 0.04869923667781093, "loss/policy_avg": 0.049704909324645996, "lr": 9.675996932515339e-06, "objective/entropy": -68.84889221191406, "objective/kl": 23.506563186645508, "objective/non_score_reward": -1.1753281354904175, "objective/rlhf_reward": -3.3227105523027003, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.8750853538513184, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.783203125, "step": 507, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99936842918396 }, { "episode": 8144, "epoch": 0.04879510131694048, "loss/policy_avg": 0.23126532137393951, "lr": 9.675357873210634e-06, "objective/entropy": -193.32493591308594, "objective/kl": 30.975135803222656, "objective/non_score_reward": -1.5487568378448486, "objective/rlhf_reward": -4.072320940271888, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 30.721832275390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.599609375, "step": 508, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.994474172592163 }, { "episode": 8160, "epoch": 0.04889096595607003, "loss/policy_avg": 0.6136177778244019, "lr": 9.67471881390593e-06, "objective/entropy": 35.12611770629883, "objective/kl": 24.636138916015625, "objective/non_score_reward": -1.2318068742752075, "objective/rlhf_reward": -2.979816268162663, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 31.945526123046875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736328125, "step": 509, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001587390899658 }, { "episode": 8176, "epoch": 0.04898683059519958, "loss/policy_avg": 0.07654842734336853, "lr": 9.674079754601228e-06, "objective/entropy": -218.7822265625, "objective/kl": 30.072967529296875, "objective/non_score_reward": -1.5036484003067017, "objective/rlhf_reward": -3.8918873689332347, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 42.21351623535156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.763671875, "step": 510, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9961347579956055 }, { "episode": 8192, "epoch": 0.04908269523432913, "loss/policy_avg": 0.4642539322376251, "lr": 9.673440695296525e-06, "objective/entropy": -61.26002502441406, "objective/kl": 28.09502410888672, "objective/non_score_reward": -1.4047513008117676, "objective/rlhf_reward": -4.168407420726165, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 28.139495849609375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.50390625, "step": 511, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988558292388916 }, { "episode": 8208, "epoch": 0.04917855987345868, "loss/policy_avg": -0.1496490240097046, "lr": 9.672801635991821e-06, "objective/entropy": -237.9604034423828, "objective/kl": 24.80710220336914, "objective/non_score_reward": -1.2403552532196045, "objective/rlhf_reward": -3.5828184867776454, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.494747161865234, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.671875, "step": 512, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000725507736206 }, { "episode": 8224, "epoch": 0.049274424512588226, "loss/policy_avg": -0.18209466338157654, "lr": 9.672162576687117e-06, "objective/entropy": -180.66116333007812, "objective/kl": 25.97962188720703, "objective/non_score_reward": -1.2989810705184937, "objective/rlhf_reward": -3.073217930571113, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 41.079193115234375, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.73046875, "step": 513, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997527837753296 }, { "episode": 8240, "epoch": 0.049370289151717775, "loss/policy_avg": 0.3504701852798462, "lr": 9.671523517382413e-06, "objective/entropy": -98.80787658691406, "objective/kl": 26.576587677001953, "objective/non_score_reward": -1.3288295269012451, "objective/rlhf_reward": -0.9153180480003353, "objective/scores": 1.1, "policy/approxkl_avg": 13.758487701416016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6953125, "step": 514, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998854160308838 }, { "episode": 8256, "epoch": 0.049466153790847324, "loss/policy_avg": 0.48611417412757874, "lr": 9.67088445807771e-06, "objective/entropy": -128.45774841308594, "objective/kl": 29.784334182739258, "objective/non_score_reward": -1.4892168045043945, "objective/rlhf_reward": -4.223533527056375, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.2566263675689697, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 515, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001587867736816 }, { "episode": 8272, "epoch": 0.04956201842997687, "loss/policy_avg": -0.13057222962379456, "lr": 9.670245398773007e-06, "objective/entropy": -146.07781982421875, "objective/kl": 31.182106018066406, "objective/non_score_reward": -1.5591052770614624, "objective/rlhf_reward": -3.8364211082458493, "objective/scores": 0.6, "policy/approxkl_avg": 15.76829719543457, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.576171875, "step": 516, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0000805854797363 }, { "episode": 8288, "epoch": 0.04965788306910642, "loss/policy_avg": 0.637583315372467, "lr": 9.669606339468304e-06, "objective/entropy": -144.37762451171875, "objective/kl": 27.648868560791016, "objective/non_score_reward": -1.3824436664581299, "objective/rlhf_reward": -4.0140026448094215, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.933715343475342, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 517, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995014667510986 }, { "episode": 8304, "epoch": 0.04975374770823597, "loss/policy_avg": 0.23517751693725586, "lr": 9.668967280163601e-06, "objective/entropy": -130.0078125, "objective/kl": 26.889904022216797, "objective/non_score_reward": -1.344495415687561, "objective/rlhf_reward": -3.927383343787536, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 35.43697738647461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.81640625, "step": 518, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9984192848205566 }, { "episode": 8320, "epoch": 0.04984961234736552, "loss/policy_avg": -0.05650443956255913, "lr": 9.668328220858896e-06, "objective/entropy": -214.1605682373047, "objective/kl": 21.148624420166016, "objective/non_score_reward": -1.0574312210083008, "objective/rlhf_reward": -2.673465876784876, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 18.935588836669922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.599609375, "step": 519, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993261098861694 }, { "episode": 8336, "epoch": 0.04994547698649507, "loss/policy_avg": -0.034447960555553436, "lr": 9.667689161554193e-06, "objective/entropy": -158.14088439941406, "objective/kl": 32.29146957397461, "objective/non_score_reward": -1.61457359790802, "objective/rlhf_reward": -4.902035086360529, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.876145362854004, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58984375, "step": 520, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993882179260254 }, { "episode": 8352, "epoch": 0.05004134162562462, "loss/policy_avg": -0.13744737207889557, "lr": 9.66705010224949e-06, "objective/entropy": -204.13546752929688, "objective/kl": 28.699504852294922, "objective/non_score_reward": -1.4349753856658936, "objective/rlhf_reward": -4.361299076167446, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.3828086853027344, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.66796875, "step": 521, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017244815826416 }, { "episode": 8368, "epoch": 0.05013720626475417, "loss/policy_avg": 0.13512714207172394, "lr": 9.666411042944787e-06, "objective/entropy": -234.03375244140625, "objective/kl": 27.24090576171875, "objective/non_score_reward": -1.3620452880859375, "objective/rlhf_reward": -3.932409131320652, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 27.1795654296875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.689453125, "step": 522, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999112844467163 }, { "episode": 8384, "epoch": 0.050233070903883716, "loss/policy_avg": -0.011349002830684185, "lr": 9.665771983640082e-06, "objective/entropy": -252.35935974121094, "objective/kl": 35.68749237060547, "objective/non_score_reward": -1.784374713897705, "objective/rlhf_reward": -5.7588963890946925, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 13.969385147094727, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 523, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9982863664627075 }, { "episode": 8400, "epoch": 0.050328935543013265, "loss/policy_avg": 0.03610409051179886, "lr": 9.665132924335379e-06, "objective/entropy": -18.527732849121094, "objective/kl": 31.889944076538086, "objective/non_score_reward": -1.5944972038269043, "objective/rlhf_reward": -4.927390317530975, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 69.35887145996094, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.86328125, "step": 524, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999839425086975 }, { "episode": 8416, "epoch": 0.050424800182142814, "loss/policy_avg": 0.4427942633628845, "lr": 9.664493865030676e-06, "objective/entropy": -203.7809295654297, "objective/kl": 25.36702537536621, "objective/non_score_reward": -1.2683511972427368, "objective/rlhf_reward": -3.6495729281502642, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 22.38974380493164, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.828125, "step": 525, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989018440246582 }, { "episode": 8432, "epoch": 0.05052066482127236, "loss/policy_avg": 1.6773953437805176, "lr": 9.663854805725971e-06, "objective/entropy": -146.93841552734375, "objective/kl": 37.069419860839844, "objective/non_score_reward": -1.853471040725708, "objective/rlhf_reward": -5.990052063663569, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.231493949890137, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.53515625, "step": 526, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981741905212402 }, { "episode": 8448, "epoch": 0.05061652946040191, "loss/policy_avg": -0.08897572010755539, "lr": 9.663215746421268e-06, "objective/entropy": -158.65708923339844, "objective/kl": 23.60004997253418, "objective/non_score_reward": -1.1800025701522827, "objective/rlhf_reward": -3.394497547179384, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 10.824882507324219, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6953125, "step": 527, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9995331764221191 }, { "episode": 8464, "epoch": 0.05071239409953146, "loss/policy_avg": 0.024341005831956863, "lr": 9.662576687116565e-06, "objective/entropy": -174.72035217285156, "objective/kl": 29.104461669921875, "objective/non_score_reward": -1.4552230834960938, "objective/rlhf_reward": -4.479256918936401, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 17.054231643676758, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.470703125, "step": 528, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999741554260254 }, { "episode": 8480, "epoch": 0.05080825873866101, "loss/policy_avg": 0.257159948348999, "lr": 9.661937627811862e-06, "objective/entropy": -200.30184936523438, "objective/kl": 23.69171905517578, "objective/non_score_reward": -1.1845859289169312, "objective/rlhf_reward": -3.338343775272369, "objective/scores": 0.35, "policy/approxkl_avg": 6.550008773803711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.595703125, "step": 529, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9984736442565918 }, { "episode": 8496, "epoch": 0.05090412337779056, "loss/policy_avg": 0.4184650182723999, "lr": 9.661298568507158e-06, "objective/entropy": -344.7420959472656, "objective/kl": 24.219188690185547, "objective/non_score_reward": -1.2109594345092773, "objective/rlhf_reward": -3.4652354205525935, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 67.58980560302734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 530, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985125064849854 }, { "episode": 8512, "epoch": 0.05099998801692011, "loss/policy_avg": -0.0187949538230896, "lr": 9.660659509202455e-06, "objective/entropy": -14.01883316040039, "objective/kl": 29.49643325805664, "objective/non_score_reward": -1.47482168674469, "objective/rlhf_reward": -4.520684697715145, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 6.090343475341797, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.59765625, "step": 531, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014190673828125 }, { "episode": 8528, "epoch": 0.051095852656049656, "loss/policy_avg": 0.5480527877807617, "lr": 9.66002044989775e-06, "objective/entropy": -169.82949829101562, "objective/kl": 34.57899475097656, "objective/non_score_reward": -1.728949785232544, "objective/rlhf_reward": -5.434846642430186, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 7.255028247833252, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.638671875, "step": 532, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971615076065063 }, { "episode": 8544, "epoch": 0.051191717295179205, "loss/policy_avg": 0.2761814594268799, "lr": 9.659381390593047e-06, "objective/entropy": -100.77452850341797, "objective/kl": 36.835365295410156, "objective/non_score_reward": -1.8417682647705078, "objective/rlhf_reward": -6.007823192809505, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 50.438026428222656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625, "step": 533, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985384941101074 }, { "episode": 8560, "epoch": 0.051287581934308754, "loss/policy_avg": 0.4119563698768616, "lr": 9.658742331288344e-06, "objective/entropy": -65.70556640625, "objective/kl": 29.577213287353516, "objective/non_score_reward": -1.47886061668396, "objective/rlhf_reward": -3.792736174837623, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.75493049621582, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4990234375, "step": 534, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002697467803955 }, { "episode": 8576, "epoch": 0.0513834465734383, "loss/policy_avg": 0.12609338760375977, "lr": 9.658103271983641e-06, "objective/entropy": -150.71954345703125, "objective/kl": 28.952709197998047, "objective/non_score_reward": -1.447635531425476, "objective/rlhf_reward": -4.3905422449111935, "objective/scores": 0.35, "policy/approxkl_avg": 34.924835205078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.615234375, "step": 535, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004661083221436 }, { "episode": 8592, "epoch": 0.05147931121256785, "loss/policy_avg": 0.014640828594565392, "lr": 9.657464212678938e-06, "objective/entropy": -37.74507141113281, "objective/kl": 25.910266876220703, "objective/non_score_reward": -1.295513391494751, "objective/rlhf_reward": -0.7820532083511349, "objective/scores": 1.1, "policy/approxkl_avg": 2.0191965103149414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.751953125, "step": 536, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0010244846343994 }, { "episode": 8608, "epoch": 0.0515751758516974, "loss/policy_avg": 0.04429921880364418, "lr": 9.656825153374235e-06, "objective/entropy": -26.176483154296875, "objective/kl": 32.8004264831543, "objective/non_score_reward": -1.6400213241577148, "objective/rlhf_reward": -4.826751814285913, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 42.128135681152344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.908203125, "step": 537, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0013113021850586 }, { "episode": 8624, "epoch": 0.05167104049082695, "loss/policy_avg": 0.46547916531562805, "lr": 9.65618609406953e-06, "objective/entropy": 7.776313781738281, "objective/kl": 28.19791030883789, "objective/non_score_reward": -1.4098955392837524, "objective/rlhf_reward": -3.906248764197031, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.504173755645752, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8828125, "step": 538, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998396635055542 }, { "episode": 8640, "epoch": 0.0517669051299565, "loss/policy_avg": 0.0001214742660522461, "lr": 9.655547034764827e-06, "objective/entropy": -112.6850357055664, "objective/kl": 31.756372451782227, "objective/non_score_reward": -1.5878187417984009, "objective/rlhf_reward": -4.228568734900032, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.7504100799560547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.546875, "step": 539, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0001778602600098 }, { "episode": 8656, "epoch": 0.05186276976908605, "loss/policy_avg": 0.41524794697761536, "lr": 9.654907975460124e-06, "objective/entropy": -135.01878356933594, "objective/kl": 23.119266510009766, "objective/non_score_reward": -1.1559633016586304, "objective/rlhf_reward": -3.0675939609676153, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 26.581480026245117, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6015625, "step": 540, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9962246417999268 }, { "episode": 8672, "epoch": 0.0519586344082156, "loss/policy_avg": 0.3321428894996643, "lr": 9.65426891615542e-06, "objective/entropy": -5.44740104675293, "objective/kl": 39.89240264892578, "objective/non_score_reward": -1.9946203231811523, "objective/rlhf_reward": -7.97848105430603, "objective/scores": 0.0, "policy/approxkl_avg": 67.52932739257812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 541, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9986319541931152 }, { "episode": 8688, "epoch": 0.052054499047345146, "loss/policy_avg": 0.22704890370368958, "lr": 9.653629856850718e-06, "objective/entropy": 23.631000518798828, "objective/kl": 22.43924331665039, "objective/non_score_reward": -1.121962308883667, "objective/rlhf_reward": -3.109246918050152, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 40.600868225097656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.708984375, "step": 542, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0008883476257324 }, { "episode": 8704, "epoch": 0.052150363686474695, "loss/policy_avg": 0.6167892217636108, "lr": 9.652990797546013e-06, "objective/entropy": 8.02947998046875, "objective/kl": 34.78337478637695, "objective/non_score_reward": -1.739168643951416, "objective/rlhf_reward": -5.556674695014953, "objective/scores": 0.35, "policy/approxkl_avg": 7.763035774230957, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.55859375, "step": 543, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983458518981934 }, { "episode": 8720, "epoch": 0.052246228325604244, "loss/policy_avg": 0.1720658838748932, "lr": 9.65235173824131e-06, "objective/entropy": 0.5252876281738281, "objective/kl": 31.73941993713379, "objective/non_score_reward": -1.5869710445404053, "objective/rlhf_reward": -4.79162499209936, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.366281509399414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.76953125, "step": 544, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988775253295898 }, { "episode": 8736, "epoch": 0.05234209296473379, "loss/policy_avg": 0.07084909081459045, "lr": 9.651712678936605e-06, "objective/entropy": -50.734527587890625, "objective/kl": 24.657032012939453, "objective/non_score_reward": -1.2328516244888306, "objective/rlhf_reward": -3.1065776899185886, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 12.337860107421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.521484375, "step": 545, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985425472259521 }, { "episode": 8752, "epoch": 0.05243795760386334, "loss/policy_avg": -0.053861357271671295, "lr": 9.651073619631902e-06, "objective/entropy": -242.29559326171875, "objective/kl": 21.178913116455078, "objective/non_score_reward": -1.058945655822754, "objective/rlhf_reward": -2.6316629386583146, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 23.818538665771484, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.62109375, "step": 546, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0112152099609375 }, { "episode": 8768, "epoch": 0.0525338222429929, "loss/policy_avg": -0.008508548140525818, "lr": 9.650434560327199e-06, "objective/entropy": -46.92424011230469, "objective/kl": 39.04132843017578, "objective/non_score_reward": -1.952066421508789, "objective/rlhf_reward": -6.429663398352963, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 15.27535629272461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4677734375, "step": 547, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982926845550537 }, { "episode": 8784, "epoch": 0.052629686882122446, "loss/policy_avg": 0.17654258012771606, "lr": 9.649795501022496e-06, "objective/entropy": -44.7242431640625, "objective/kl": 19.804813385009766, "objective/non_score_reward": -0.9902406930923462, "objective/rlhf_reward": -2.635450038939638, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 39.75682067871094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.76953125, "step": 548, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002223014831543 }, { "episode": 8800, "epoch": 0.052725551521251995, "loss/policy_avg": 0.46367156505584717, "lr": 9.649156441717792e-06, "objective/entropy": -132.18556213378906, "objective/kl": 38.18450927734375, "objective/non_score_reward": -1.909225344657898, "objective/rlhf_reward": -6.0327816343942455, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 24.263263702392578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7421875, "step": 549, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9974383115768433 }, { "episode": 8816, "epoch": 0.052821416160381544, "loss/policy_avg": 0.2747136950492859, "lr": 9.64851738241309e-06, "objective/entropy": -91.26388549804688, "objective/kl": 28.735111236572266, "objective/non_score_reward": -1.4367555379867554, "objective/rlhf_reward": -4.085162764013396, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.113122940063477, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.626953125, "step": 550, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000535011291504 }, { "episode": 8832, "epoch": 0.05291728079951109, "loss/policy_avg": 0.031243963167071342, "lr": 9.647878323108384e-06, "objective/entropy": -40.358192443847656, "objective/kl": 31.673667907714844, "objective/non_score_reward": -1.5836834907531738, "objective/rlhf_reward": -4.993098309546142, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 78.17581939697266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.796875, "step": 551, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989157915115356 }, { "episode": 8848, "epoch": 0.05301314543864064, "loss/policy_avg": -0.28017422556877136, "lr": 9.647239263803681e-06, "objective/entropy": -100.97856140136719, "objective/kl": 33.18678283691406, "objective/non_score_reward": -1.659339189529419, "objective/rlhf_reward": -6.637356638908386, "objective/scores": 0.0, "policy/approxkl_avg": 6.006505012512207, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.572265625, "step": 552, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003824234008789 }, { "episode": 8864, "epoch": 0.05310901007777019, "loss/policy_avg": 0.04892890527844429, "lr": 9.646600204498978e-06, "objective/entropy": -136.31918334960938, "objective/kl": 19.06879997253418, "objective/non_score_reward": -0.9534400105476379, "objective/rlhf_reward": -2.2575007369190008, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.5354987382888794, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.572265625, "step": 553, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003046989440918 }, { "episode": 8880, "epoch": 0.05320487471689974, "loss/policy_avg": 0.1114959716796875, "lr": 9.645961145194275e-06, "objective/entropy": -125.14915466308594, "objective/kl": 41.65575408935547, "objective/non_score_reward": -2.0827877521514893, "objective/rlhf_reward": -6.383740137295659, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 12.4759521484375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66796875, "step": 554, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973247051239014 }, { "episode": 8896, "epoch": 0.05330073935602929, "loss/policy_avg": 0.2784144878387451, "lr": 9.645322085889572e-06, "objective/entropy": -42.213340759277344, "objective/kl": 34.43170928955078, "objective/non_score_reward": -1.7215855121612549, "objective/rlhf_reward": -6.8863421976566315, "objective/scores": 0.0, "policy/approxkl_avg": 37.5791015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.546875, "step": 555, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974336624145508 }, { "episode": 8912, "epoch": 0.05339660399515884, "loss/policy_avg": -0.0683375895023346, "lr": 9.644683026584867e-06, "objective/entropy": -94.292724609375, "objective/kl": 29.925048828125, "objective/non_score_reward": -1.4962522983551025, "objective/rlhf_reward": -4.4287500669627935, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 0.9679741263389587, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.53125, "step": 556, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002323627471924 }, { "episode": 8928, "epoch": 0.05349246863428839, "loss/policy_avg": 0.3528517484664917, "lr": 9.644043967280164e-06, "objective/entropy": 100.1601791381836, "objective/kl": 29.87194061279297, "objective/non_score_reward": -1.4935970306396484, "objective/rlhf_reward": -4.493435802872538, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 21.40321922302246, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.54296875, "step": 557, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999911904335022 }, { "episode": 8944, "epoch": 0.053588333273417936, "loss/policy_avg": 0.15664523839950562, "lr": 9.643404907975461e-06, "objective/entropy": -163.13458251953125, "objective/kl": 43.485382080078125, "objective/non_score_reward": -2.174269199371338, "objective/rlhf_reward": -6.297076797485351, "objective/scores": 0.6, "policy/approxkl_avg": 28.333932876586914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.52734375, "step": 558, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9964426755905151 }, { "episode": 8960, "epoch": 0.053684197912547485, "loss/policy_avg": 0.6344835162162781, "lr": 9.642765848670758e-06, "objective/entropy": -252.752685546875, "objective/kl": 33.16960144042969, "objective/non_score_reward": -1.658479928970337, "objective/rlhf_reward": -5.255317785827023, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 52.37012481689453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.771484375, "step": 559, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0004310607910156 }, { "episode": 8976, "epoch": 0.053780062551677034, "loss/policy_avg": 0.19869406521320343, "lr": 9.642126789366055e-06, "objective/entropy": -50.086647033691406, "objective/kl": 30.926883697509766, "objective/non_score_reward": -1.5463443994522095, "objective/rlhf_reward": -4.629118292537287, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 26.995628356933594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 560, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9950168132781982 }, { "episode": 8992, "epoch": 0.05387592719080658, "loss/policy_avg": -0.010918349027633667, "lr": 9.641487730061352e-06, "objective/entropy": -168.9771728515625, "objective/kl": 22.5106201171875, "objective/non_score_reward": -1.1255309581756592, "objective/rlhf_reward": -3.1604882984453733, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 20.162094116210938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.763671875, "step": 561, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001406669616699 }, { "episode": 9008, "epoch": 0.05397179182993613, "loss/policy_avg": 0.4963573217391968, "lr": 9.640848670756647e-06, "objective/entropy": -159.58302307128906, "objective/kl": 34.39787673950195, "objective/non_score_reward": -1.7198940515518188, "objective/rlhf_reward": -5.455743868549433, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 32.154441833496094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59375, "step": 562, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99989652633667 }, { "episode": 9024, "epoch": 0.05406765646906568, "loss/policy_avg": 0.4512660503387451, "lr": 9.640209611451944e-06, "objective/entropy": -112.33628845214844, "objective/kl": 34.371681213378906, "objective/non_score_reward": -1.7185840606689453, "objective/rlhf_reward": -5.515086495612545, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 4.578237056732178, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.599609375, "step": 563, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984909296035767 }, { "episode": 9040, "epoch": 0.05416352110819523, "loss/policy_avg": 0.08781366050243378, "lr": 9.63957055214724e-06, "objective/entropy": -39.49800491333008, "objective/kl": 33.1617431640625, "objective/non_score_reward": -1.6580872535705566, "objective/rlhf_reward": -4.232348775863647, "objective/scores": 0.6, "policy/approxkl_avg": 4.19449520111084, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.677734375, "step": 564, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000304698944092 }, { "episode": 9056, "epoch": 0.05425938574732478, "loss/policy_avg": 0.02701903134584427, "lr": 9.638931492842537e-06, "objective/entropy": -135.10118103027344, "objective/kl": 34.19304656982422, "objective/non_score_reward": -1.7096521854400635, "objective/rlhf_reward": -5.388010840030059, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 18.33478546142578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.61328125, "step": 565, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999070644378662 }, { "episode": 9072, "epoch": 0.05435525038645433, "loss/policy_avg": 0.2804332375526428, "lr": 9.638292433537834e-06, "objective/entropy": -100.01052856445312, "objective/kl": 28.388795852661133, "objective/non_score_reward": -1.4194397926330566, "objective/rlhf_reward": -5.677759170532227, "objective/scores": 0.0, "policy/approxkl_avg": 7.587360382080078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.68359375, "step": 566, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005486011505127 }, { "episode": 9088, "epoch": 0.05445111502558388, "loss/policy_avg": 0.4314262866973877, "lr": 9.63765337423313e-06, "objective/entropy": -130.2495574951172, "objective/kl": 35.38700866699219, "objective/non_score_reward": -1.7693501710891724, "objective/rlhf_reward": -5.4155414156323545, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 44.93388366699219, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.75390625, "step": 567, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9968568086624146 }, { "episode": 9104, "epoch": 0.054546979664713426, "loss/policy_avg": 0.3399587869644165, "lr": 9.637014314928426e-06, "objective/entropy": -247.61073303222656, "objective/kl": 28.445119857788086, "objective/non_score_reward": -1.4222559928894043, "objective/rlhf_reward": -3.864195342334818, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.162724018096924, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.607421875, "step": 568, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984673261642456 }, { "episode": 9120, "epoch": 0.054642844303842975, "loss/policy_avg": 0.5520263314247131, "lr": 9.636375255623721e-06, "objective/entropy": -97.92376708984375, "objective/kl": 26.055057525634766, "objective/non_score_reward": -1.30275297164917, "objective/rlhf_reward": -3.088305356279884, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 36.18694305419922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.494140625, "step": 569, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0012941360473633 }, { "episode": 9136, "epoch": 0.054738708942972523, "loss/policy_avg": 0.09734541922807693, "lr": 9.635736196319018e-06, "objective/entropy": -196.53872680664062, "objective/kl": 23.71702003479004, "objective/non_score_reward": -1.185850977897644, "objective/rlhf_reward": -4.743403911590576, "objective/scores": 0.0, "policy/approxkl_avg": 2.213500738143921, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.580078125, "step": 570, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993475675582886 }, { "episode": 9152, "epoch": 0.05483457358210207, "loss/policy_avg": 0.4516823887825012, "lr": 9.635097137014315e-06, "objective/entropy": -126.11761474609375, "objective/kl": 28.336185455322266, "objective/non_score_reward": -1.4168094396591187, "objective/rlhf_reward": -1.2672375202178952, "objective/scores": 1.1, "policy/approxkl_avg": 44.684326171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.775390625, "step": 571, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990687370300293 }, { "episode": 9168, "epoch": 0.05493043822123162, "loss/policy_avg": 0.34894299507141113, "lr": 9.634458077709612e-06, "objective/entropy": -3.410472869873047, "objective/kl": 35.99509048461914, "objective/non_score_reward": -1.7997545003890991, "objective/rlhf_reward": -5.87350514891736, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.621858596801758, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.634765625, "step": 572, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0019896030426025 }, { "episode": 9184, "epoch": 0.05502630286036117, "loss/policy_avg": 0.1023169457912445, "lr": 9.633819018404909e-06, "objective/entropy": -180.73724365234375, "objective/kl": 24.693328857421875, "objective/non_score_reward": -1.2346664667129517, "objective/rlhf_reward": -3.5386658668518063, "objective/scores": 0.35, "policy/approxkl_avg": 22.89309310913086, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.64453125, "step": 573, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981255531311035 }, { "episode": 9200, "epoch": 0.05512216749949072, "loss/policy_avg": 0.2509443163871765, "lr": 9.633179959100206e-06, "objective/entropy": -268.43072509765625, "objective/kl": 28.437435150146484, "objective/non_score_reward": -1.4218716621398926, "objective/rlhf_reward": -4.131227611508921, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 60.228729248046875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.78125, "step": 574, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000219345092773 }, { "episode": 9216, "epoch": 0.05521803213862027, "loss/policy_avg": -0.04683633893728256, "lr": 9.632540899795501e-06, "objective/entropy": -70.71329498291016, "objective/kl": 38.51101303100586, "objective/non_score_reward": -1.9255508184432983, "objective/rlhf_reward": -5.877374465736459, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.3532156944274902, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.615234375, "step": 575, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000458002090454 }, { "episode": 9232, "epoch": 0.05531389677774982, "loss/policy_avg": 0.25571292638778687, "lr": 9.631901840490798e-06, "objective/entropy": -197.88787841796875, "objective/kl": 25.574037551879883, "objective/non_score_reward": -1.278701901435852, "objective/rlhf_reward": -3.3814741532007853, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.096738815307617, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.671875, "step": 576, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001577854156494 }, { "episode": 9248, "epoch": 0.055409761416879366, "loss/policy_avg": 0.7064580917358398, "lr": 9.631262781186095e-06, "objective/entropy": -150.29953002929688, "objective/kl": 30.821884155273438, "objective/non_score_reward": -1.5410943031311035, "objective/rlhf_reward": -4.43104387919108, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 43.45115280151367, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.60546875, "step": 577, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9980316162109375 }, { "episode": 9264, "epoch": 0.055505626056008915, "loss/policy_avg": 0.20062510669231415, "lr": 9.630623721881392e-06, "objective/entropy": -158.88388061523438, "objective/kl": 28.73421859741211, "objective/non_score_reward": -1.4367109537124634, "objective/rlhf_reward": -4.346843814849853, "objective/scores": 0.35, "policy/approxkl_avg": 12.110857963562012, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 578, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998002290725708 }, { "episode": 9280, "epoch": 0.055601490695138464, "loss/policy_avg": 0.08450721949338913, "lr": 9.629984662576689e-06, "objective/entropy": -250.45445251464844, "objective/kl": 27.57752227783203, "objective/non_score_reward": -1.3788762092590332, "objective/rlhf_reward": -4.064906816096649, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 17.175188064575195, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5859375, "step": 579, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997262716293335 }, { "episode": 9296, "epoch": 0.05569735533426801, "loss/policy_avg": 0.41482874751091003, "lr": 9.629345603271984e-06, "objective/entropy": -177.06607055664062, "objective/kl": 29.43456268310547, "objective/non_score_reward": -1.4717282056808472, "objective/rlhf_reward": -2.9631939872514934, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 50.86977005004883, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595703125, "step": 580, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005576610565186 }, { "episode": 9312, "epoch": 0.05579321997339756, "loss/policy_avg": 0.20043331384658813, "lr": 9.62870654396728e-06, "objective/entropy": -224.79660034179688, "objective/kl": 23.171340942382812, "objective/non_score_reward": -1.1585670709609985, "objective/rlhf_reward": -2.6868569953011825, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.841948986053467, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603515625, "step": 581, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00144362449646 }, { "episode": 9328, "epoch": 0.05588908461252711, "loss/policy_avg": 0.28447139263153076, "lr": 9.628067484662578e-06, "objective/entropy": -44.1309814453125, "objective/kl": 42.387351989746094, "objective/non_score_reward": -2.1193675994873047, "objective/rlhf_reward": -7.151957724124117, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 20.72610092163086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.419921875, "step": 582, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971909523010254 }, { "episode": 9344, "epoch": 0.05598494925165666, "loss/policy_avg": 0.09533769637346268, "lr": 9.627428425357874e-06, "objective/entropy": -218.9058380126953, "objective/kl": 27.360652923583984, "objective/non_score_reward": -1.368032693862915, "objective/rlhf_reward": -4.021532396884307, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.28432846069336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 583, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9974052906036377 }, { "episode": 9360, "epoch": 0.05608081389078621, "loss/policy_avg": 0.5065032243728638, "lr": 9.626789366053171e-06, "objective/entropy": -231.38427734375, "objective/kl": 32.08224105834961, "objective/non_score_reward": -1.604112148284912, "objective/rlhf_reward": -5.0748127012545154, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 40.948760986328125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69921875, "step": 584, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988150596618652 }, { "episode": 9376, "epoch": 0.05617667852991576, "loss/policy_avg": 0.6530688405036926, "lr": 9.626150306748468e-06, "objective/entropy": -116.65798950195312, "objective/kl": 31.407730102539062, "objective/non_score_reward": -1.570386528968811, "objective/rlhf_reward": -4.902944007006985, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 13.348186492919922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.54296875, "step": 585, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000188112258911 }, { "episode": 9392, "epoch": 0.05627254316904531, "loss/policy_avg": -0.06093317270278931, "lr": 9.625511247443763e-06, "objective/entropy": -245.7208251953125, "objective/kl": 22.28873634338379, "objective/non_score_reward": -1.1144368648529053, "objective/rlhf_reward": -2.33504098869947, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.7080774307250977, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.681640625, "step": 586, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002075672149658 }, { "episode": 9408, "epoch": 0.056368407808174856, "loss/policy_avg": 0.4493389129638672, "lr": 9.62487218813906e-06, "objective/entropy": -11.156410217285156, "objective/kl": 29.71312141418457, "objective/non_score_reward": -1.4856561422348022, "objective/rlhf_reward": -4.117795641693186, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 18.012893676757812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.759765625, "step": 587, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000352382659912 }, { "episode": 9424, "epoch": 0.056464272447304405, "loss/policy_avg": 0.3274408280849457, "lr": 9.624233128834357e-06, "objective/entropy": -116.3506088256836, "objective/kl": 35.94437026977539, "objective/non_score_reward": -1.7972185611724854, "objective/rlhf_reward": -4.788874185085296, "objective/scores": 0.6, "policy/approxkl_avg": 17.158645629882812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.69140625, "step": 588, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996552467346191 }, { "episode": 9440, "epoch": 0.056560137086433954, "loss/policy_avg": 0.879096508026123, "lr": 9.623594069529654e-06, "objective/entropy": -152.50155639648438, "objective/kl": 32.464576721191406, "objective/non_score_reward": -1.623228669166565, "objective/rlhf_reward": -5.069082756240931, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 70.49058532714844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.849609375, "step": 589, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001586437225342 }, { "episode": 9456, "epoch": 0.0566560017255635, "loss/policy_avg": 0.2921786904335022, "lr": 9.62295501022495e-06, "objective/entropy": -177.27088928222656, "objective/kl": 39.783531188964844, "objective/non_score_reward": -1.989176630973816, "objective/rlhf_reward": -6.57810423621307, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 77.26689147949219, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6796875, "step": 590, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9989970922470093 }, { "episode": 9472, "epoch": 0.05675186636469305, "loss/policy_avg": 0.3912142515182495, "lr": 9.622315950920246e-06, "objective/entropy": -120.1540756225586, "objective/kl": 31.21270179748535, "objective/non_score_reward": -1.5606350898742676, "objective/rlhf_reward": -3.842540299892425, "objective/scores": 0.6, "policy/approxkl_avg": 25.256790161132812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.654296875, "step": 591, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980988502502441 }, { "episode": 9488, "epoch": 0.0568477310038226, "loss/policy_avg": 0.04369340091943741, "lr": 9.621676891615543e-06, "objective/entropy": -277.40753173828125, "objective/kl": 29.685585021972656, "objective/non_score_reward": -1.4842792749404907, "objective/rlhf_reward": -1.5371170997619625, "objective/scores": 1.1, "policy/approxkl_avg": 7.890674591064453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.677734375, "step": 592, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981930255889893 }, { "episode": 9504, "epoch": 0.05694359564295215, "loss/policy_avg": 0.05721379816532135, "lr": 9.621037832310838e-06, "objective/entropy": -257.69232177734375, "objective/kl": 23.966060638427734, "objective/non_score_reward": -1.19830322265625, "objective/rlhf_reward": -3.0598793412248293, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 20.133102416992188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.732421875, "step": 593, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001232624053955 }, { "episode": 9520, "epoch": 0.0570394602820817, "loss/policy_avg": 0.5772296786308289, "lr": 9.620398773006135e-06, "objective/entropy": -89.6330795288086, "objective/kl": 31.078372955322266, "objective/non_score_reward": -1.5539186000823975, "objective/rlhf_reward": -4.734722021038889, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 21.1763916015625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.87109375, "step": 594, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000513792037964 }, { "episode": 9536, "epoch": 0.05713532492121125, "loss/policy_avg": -0.026315592229366302, "lr": 9.619759713701432e-06, "objective/entropy": -219.30979919433594, "objective/kl": 26.461135864257812, "objective/non_score_reward": -1.323056697845459, "objective/rlhf_reward": -3.9329772827371787, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 8.585318565368652, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.677734375, "step": 595, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008788108825684 }, { "episode": 9552, "epoch": 0.057231189560340796, "loss/policy_avg": 0.2548080384731293, "lr": 9.619120654396729e-06, "objective/entropy": -37.27716827392578, "objective/kl": 44.03446960449219, "objective/non_score_reward": -2.201723575592041, "objective/rlhf_reward": -7.356295923800811, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 21.06201934814453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 596, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991514682769775 }, { "episode": 9568, "epoch": 0.057327054199470345, "loss/policy_avg": 2.5911049842834473, "lr": 9.618481595092026e-06, "objective/entropy": -171.7782745361328, "objective/kl": 20.800029754638672, "objective/non_score_reward": -1.0400015115737915, "objective/rlhf_reward": -1.760006046295166, "objective/scores": 0.6, "policy/approxkl_avg": 2.9469943046569824, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 597, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.014529228210449 }, { "episode": 9584, "epoch": 0.057422918838599894, "loss/policy_avg": -0.1166892945766449, "lr": 9.617842535787323e-06, "objective/entropy": -109.67333221435547, "objective/kl": 34.37934494018555, "objective/non_score_reward": -1.7189671993255615, "objective/rlhf_reward": -6.8758686780929565, "objective/scores": 0.0, "policy/approxkl_avg": 17.377391815185547, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4794921875, "step": 598, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002119541168213 }, { "episode": 9600, "epoch": 0.05751878347772944, "loss/policy_avg": -0.15396325290203094, "lr": 9.617203476482618e-06, "objective/entropy": -128.05728149414062, "objective/kl": 29.42688751220703, "objective/non_score_reward": -1.4713443517684937, "objective/rlhf_reward": -4.060548658641886, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.408236026763916, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.71484375, "step": 599, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002739429473877 }, { "episode": 9616, "epoch": 0.05761464811685899, "loss/policy_avg": 0.14407247304916382, "lr": 9.616564417177915e-06, "objective/entropy": -272.3529357910156, "objective/kl": 21.596874237060547, "objective/non_score_reward": -1.0798437595367432, "objective/rlhf_reward": -1.3956560238611426, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.104412078857422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.58984375, "step": 600, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001608371734619 }, { "episode": 9632, "epoch": 0.05771051275598854, "loss/policy_avg": 0.20445303618907928, "lr": 9.615925357873211e-06, "objective/entropy": -291.0384521484375, "objective/kl": 28.06856918334961, "objective/non_score_reward": -1.403428554534912, "objective/rlhf_reward": -4.235111692038876, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.333198547363281, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.720703125, "step": 601, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0009381771087646 }, { "episode": 9648, "epoch": 0.05780637739511809, "loss/policy_avg": 0.7656448483467102, "lr": 9.615286298568508e-06, "objective/entropy": -4.355806350708008, "objective/kl": 34.863006591796875, "objective/non_score_reward": -1.7431503534317017, "objective/rlhf_reward": -5.548769433696833, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 10.645190238952637, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 602, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971041679382324 }, { "episode": 9664, "epoch": 0.05790224203424764, "loss/policy_avg": 0.1100698709487915, "lr": 9.614647239263805e-06, "objective/entropy": -203.49618530273438, "objective/kl": 19.046649932861328, "objective/non_score_reward": -0.9523325562477112, "objective/rlhf_reward": -2.4307281161225855, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.499467670917511, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.638671875, "step": 603, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0019733905792236 }, { "episode": 9680, "epoch": 0.05799810667337719, "loss/policy_avg": 0.17878472805023193, "lr": 9.6140081799591e-06, "objective/entropy": -162.996826171875, "objective/kl": 23.458127975463867, "objective/non_score_reward": -1.172906517982483, "objective/rlhf_reward": -3.3661131596862504, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 8.434497833251953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5390625, "step": 604, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9986371994018555 }, { "episode": 9696, "epoch": 0.058093971312506744, "loss/policy_avg": 0.5608217716217041, "lr": 9.613369120654397e-06, "objective/entropy": -168.91802978515625, "objective/kl": 31.90495491027832, "objective/non_score_reward": -1.5952478647232056, "objective/rlhf_reward": -3.4572724446069927, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 10.658321380615234, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.57421875, "step": 605, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999322772026062 }, { "episode": 9712, "epoch": 0.05818983595163629, "loss/policy_avg": 0.10194225609302521, "lr": 9.612730061349694e-06, "objective/entropy": -138.00286865234375, "objective/kl": 34.8355712890625, "objective/non_score_reward": -1.7417783737182617, "objective/rlhf_reward": -5.641600999861879, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 19.823665618896484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.765625, "step": 606, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000347852706909 }, { "episode": 9728, "epoch": 0.05828570059076584, "loss/policy_avg": 1.170401930809021, "lr": 9.612091002044991e-06, "objective/entropy": -171.179443359375, "objective/kl": 23.883764266967773, "objective/non_score_reward": -1.1941882371902466, "objective/rlhf_reward": -3.2609813449704017, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.674392819404602, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.654296875, "step": 607, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004193782806396 }, { "episode": 9744, "epoch": 0.05838156522989539, "loss/policy_avg": 0.05054464191198349, "lr": 9.611451942740288e-06, "objective/entropy": -196.56436157226562, "objective/kl": 23.218883514404297, "objective/non_score_reward": -1.1609442234039307, "objective/rlhf_reward": -3.1931789918855276, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.145727157592773, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60546875, "step": 608, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998692512512207 }, { "episode": 9760, "epoch": 0.05847742986902494, "loss/policy_avg": 0.054385945200920105, "lr": 9.610812883435585e-06, "objective/entropy": -244.93141174316406, "objective/kl": 29.985477447509766, "objective/non_score_reward": -1.4992740154266357, "objective/rlhf_reward": -4.637845957015438, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.703460693359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 609, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0000925064086914 }, { "episode": 9776, "epoch": 0.05857329450815449, "loss/policy_avg": -0.05685323104262352, "lr": 9.61017382413088e-06, "objective/entropy": -65.63417053222656, "objective/kl": 31.53623390197754, "objective/non_score_reward": -1.5768117904663086, "objective/rlhf_reward": -3.383528147579405, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.860790252685547, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.59375, "step": 610, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001614809036255 }, { "episode": 9792, "epoch": 0.05866915914728404, "loss/policy_avg": 0.20876801013946533, "lr": 9.609534764826177e-06, "objective/entropy": -112.53227996826172, "objective/kl": 41.12568664550781, "objective/non_score_reward": -2.0562844276428223, "objective/rlhf_reward": -5.825137710571289, "objective/scores": 0.6, "policy/approxkl_avg": 33.385337829589844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 611, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000166654586792 }, { "episode": 9808, "epoch": 0.05876502378641359, "loss/policy_avg": 0.2722185552120209, "lr": 9.608895705521472e-06, "objective/entropy": -124.71205139160156, "objective/kl": 38.9796257019043, "objective/non_score_reward": -1.9489812850952148, "objective/rlhf_reward": -5.395925498008728, "objective/scores": 0.6, "policy/approxkl_avg": 19.52260971069336, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.623046875, "step": 612, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9988607168197632 }, { "episode": 9824, "epoch": 0.058860888425543136, "loss/policy_avg": 0.7936792969703674, "lr": 9.608256646216769e-06, "objective/entropy": -150.9628448486328, "objective/kl": 32.946922302246094, "objective/non_score_reward": -1.6473462581634521, "objective/rlhf_reward": -5.165553171833125, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 23.228769302368164, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.537109375, "step": 613, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001022815704346 }, { "episode": 9840, "epoch": 0.058956753064672685, "loss/policy_avg": 0.8288295269012451, "lr": 9.607617586912066e-06, "objective/entropy": -145.37136840820312, "objective/kl": 37.17048645019531, "objective/non_score_reward": -1.8585245609283447, "objective/rlhf_reward": -5.6092691376534205, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 8.95422077178955, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.568359375, "step": 614, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995349645614624 }, { "episode": 9856, "epoch": 0.05905261770380223, "loss/policy_avg": 0.19199243187904358, "lr": 9.606978527607363e-06, "objective/entropy": -158.26043701171875, "objective/kl": 31.016521453857422, "objective/non_score_reward": -1.550826072692871, "objective/rlhf_reward": -4.8440544244989585, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.4004452228546143, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5546875, "step": 615, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00236439704895 }, { "episode": 9872, "epoch": 0.05914848234293178, "loss/policy_avg": 0.29752206802368164, "lr": 9.60633946830266e-06, "objective/entropy": -141.43800354003906, "objective/kl": 27.8808536529541, "objective/non_score_reward": -1.394042730331421, "objective/rlhf_reward": -3.842837558190028, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 10.629474639892578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5078125, "step": 616, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00028657913208 }, { "episode": 9888, "epoch": 0.05924434698206133, "loss/policy_avg": 0.2227097749710083, "lr": 9.605700408997955e-06, "objective/entropy": -97.0810775756836, "objective/kl": 34.3601188659668, "objective/non_score_reward": -1.718005895614624, "objective/rlhf_reward": -5.4481916024285235, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 16.432331085205078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.548828125, "step": 617, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975783824920654 }, { "episode": 9904, "epoch": 0.05934021162119088, "loss/policy_avg": 0.17975842952728271, "lr": 9.605061349693252e-06, "objective/entropy": -200.100830078125, "objective/kl": 28.51620864868164, "objective/non_score_reward": -1.4258103370666504, "objective/rlhf_reward": -3.8784127190438022, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.591612815856934, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 618, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000221729278564 }, { "episode": 9920, "epoch": 0.05943607626032043, "loss/policy_avg": 0.4452857971191406, "lr": 9.604422290388548e-06, "objective/entropy": -87.9361572265625, "objective/kl": 34.174217224121094, "objective/non_score_reward": -1.7087109088897705, "objective/rlhf_reward": -5.278584449496821, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 24.203800201416016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.587890625, "step": 619, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989922046661377 }, { "episode": 9936, "epoch": 0.05953194089944998, "loss/policy_avg": 0.31785786151885986, "lr": 9.603783231083845e-06, "objective/entropy": -56.93491744995117, "objective/kl": 34.28547286987305, "objective/non_score_reward": -1.7142736911773682, "objective/rlhf_reward": -5.032265897068094, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 12.636474609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.85546875, "step": 620, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996368408203125 }, { "episode": 9952, "epoch": 0.05962780553857953, "loss/policy_avg": 0.6350647211074829, "lr": 9.603144171779142e-06, "objective/entropy": -129.3587188720703, "objective/kl": 41.710655212402344, "objective/non_score_reward": -2.0855326652526855, "objective/rlhf_reward": -6.219424667135749, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.748146057128906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4775390625, "step": 621, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979774951934814 }, { "episode": 9968, "epoch": 0.059723670177709076, "loss/policy_avg": 0.9843254089355469, "lr": 9.602505112474439e-06, "objective/entropy": -95.34288024902344, "objective/kl": 49.37370300292969, "objective/non_score_reward": -2.4686851501464844, "objective/rlhf_reward": -8.049912209781716, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 31.02006721496582, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4970703125, "step": 622, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9965976476669312 }, { "episode": 9984, "epoch": 0.059819534816838625, "loss/policy_avg": 0.6165390610694885, "lr": 9.601866053169734e-06, "objective/entropy": -100.56966400146484, "objective/kl": 33.22990036010742, "objective/non_score_reward": -1.6614950895309448, "objective/rlhf_reward": -5.286730491851253, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 13.85442066192627, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.61328125, "step": 623, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971177577972412 }, { "episode": 10000, "epoch": 0.059915399455968174, "loss/policy_avg": 0.3318287134170532, "lr": 9.601226993865031e-06, "objective/entropy": -212.1555938720703, "objective/kl": 25.822668075561523, "objective/non_score_reward": -1.2911334037780762, "objective/rlhf_reward": -2.2408145412218303, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.2788864374160767, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.525390625, "step": 624, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995461702346802 }, { "episode": 10016, "epoch": 0.06001126409509772, "loss/policy_avg": 0.35671815276145935, "lr": 9.600587934560328e-06, "objective/entropy": -96.60403442382812, "objective/kl": 42.28247833251953, "objective/non_score_reward": -2.114124059677124, "objective/rlhf_reward": -6.6316679671135645, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 9.525958061218262, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.55859375, "step": 625, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999858021736145 }, { "episode": 10032, "epoch": 0.06010712873422727, "loss/policy_avg": 0.026430530473589897, "lr": 9.599948875255625e-06, "objective/entropy": -96.45112609863281, "objective/kl": 30.055763244628906, "objective/non_score_reward": -1.5027881860733032, "objective/rlhf_reward": -4.56055448493515, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.234503746032715, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.521484375, "step": 626, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002078056335449 }, { "episode": 10048, "epoch": 0.06020299337335682, "loss/policy_avg": -0.07770150899887085, "lr": 9.599309815950922e-06, "objective/entropy": -78.50785827636719, "objective/kl": 33.19765090942383, "objective/non_score_reward": -1.6598827838897705, "objective/rlhf_reward": -5.158578279431223, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 60.745849609375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5546875, "step": 627, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0026586055755615 }, { "episode": 10064, "epoch": 0.06029885801248637, "loss/policy_avg": 0.045525066554546356, "lr": 9.598670756646217e-06, "objective/entropy": -207.98727416992188, "objective/kl": 34.44676208496094, "objective/non_score_reward": -1.7223379611968994, "objective/rlhf_reward": -5.489351963996887, "objective/scores": 0.35, "policy/approxkl_avg": 2.952592372894287, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.671875, "step": 628, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989871978759766 }, { "episode": 10080, "epoch": 0.06039472265161592, "loss/policy_avg": 0.32521092891693115, "lr": 9.598031697341514e-06, "objective/entropy": -71.00718688964844, "objective/kl": 27.00582504272461, "objective/non_score_reward": -1.3502912521362305, "objective/rlhf_reward": -3.977332849701015, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.865281105041504, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7578125, "step": 629, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001133918762207 }, { "episode": 10096, "epoch": 0.06049058729074547, "loss/policy_avg": 0.22257700562477112, "lr": 9.59739263803681e-06, "objective/entropy": -87.40052795410156, "objective/kl": 31.356922149658203, "objective/non_score_reward": -1.5678460597991943, "objective/rlhf_reward": -4.32397324867719, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 27.549453735351562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.546875, "step": 630, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999577522277832 }, { "episode": 10112, "epoch": 0.06058645192987502, "loss/policy_avg": 0.4591647982597351, "lr": 9.596753578732108e-06, "objective/entropy": -35.01010513305664, "objective/kl": 28.93059539794922, "objective/non_score_reward": -1.4465298652648926, "objective/rlhf_reward": -4.42686941597311, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 10.006196975708008, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8203125, "step": 631, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9968595504760742 }, { "episode": 10128, "epoch": 0.060682316569004566, "loss/policy_avg": 0.9483177661895752, "lr": 9.596114519427405e-06, "objective/entropy": -152.91030883789062, "objective/kl": 30.360069274902344, "objective/non_score_reward": -1.5180034637451172, "objective/rlhf_reward": -4.338680283228555, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 15.410400390625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.599609375, "step": 632, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9943327903747559 }, { "episode": 10144, "epoch": 0.060778181208134115, "loss/policy_avg": 0.4167541265487671, "lr": 9.595475460122701e-06, "objective/entropy": -154.04684448242188, "objective/kl": 33.39550018310547, "objective/non_score_reward": -1.6697750091552734, "objective/rlhf_reward": -5.074980471197682, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 53.406578063964844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.576171875, "step": 633, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9963878393173218 }, { "episode": 10160, "epoch": 0.060874045847263664, "loss/policy_avg": -0.021846026182174683, "lr": 9.594836400817997e-06, "objective/entropy": -22.81509780883789, "objective/kl": 23.709880828857422, "objective/non_score_reward": -1.1854941844940186, "objective/rlhf_reward": -2.917147810730051, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 0.839837908744812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6953125, "step": 634, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000304937362671 }, { "episode": 10176, "epoch": 0.06096991048639321, "loss/policy_avg": 0.014755940064787865, "lr": 9.594197341513293e-06, "objective/entropy": -198.07839965820312, "objective/kl": 21.79191017150879, "objective/non_score_reward": -1.0895954370498657, "objective/rlhf_reward": -1.9583818078041078, "objective/scores": 0.6, "policy/approxkl_avg": 0.6484163999557495, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.767578125, "step": 635, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002074241638184 }, { "episode": 10192, "epoch": 0.06106577512552276, "loss/policy_avg": 0.13533297181129456, "lr": 9.593558282208589e-06, "objective/entropy": -201.26246643066406, "objective/kl": 26.135250091552734, "objective/non_score_reward": -1.3067626953125, "objective/rlhf_reward": -3.885414889364868, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 11.92165756225586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.740234375, "step": 636, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993889331817627 }, { "episode": 10208, "epoch": 0.06116163976465231, "loss/policy_avg": 0.4021642506122589, "lr": 9.592919222903886e-06, "objective/entropy": -286.0339050292969, "objective/kl": 14.542181968688965, "objective/non_score_reward": -0.7271090745925903, "objective/rlhf_reward": -1.484604258735744, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.031335353851318, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.701171875, "step": 637, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.003591775894165 }, { "episode": 10224, "epoch": 0.06125750440378186, "loss/policy_avg": 0.2514651417732239, "lr": 9.592280163599182e-06, "objective/entropy": -132.75355529785156, "objective/kl": 25.25128173828125, "objective/non_score_reward": -1.2625641822814941, "objective/rlhf_reward": -3.5996581717446894, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 14.74315071105957, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.712890625, "step": 638, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000126361846924 }, { "episode": 10240, "epoch": 0.06135336904291141, "loss/policy_avg": 0.012995198369026184, "lr": 9.59164110429448e-06, "objective/entropy": -181.2290496826172, "objective/kl": 22.253154754638672, "objective/non_score_reward": -1.1126577854156494, "objective/rlhf_reward": -3.026798923214046, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 0.9591898918151855, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53515625, "step": 639, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993176460266113 }, { "episode": 10256, "epoch": 0.06144923368204096, "loss/policy_avg": 0.15271592140197754, "lr": 9.591002044989776e-06, "objective/entropy": -105.57412719726562, "objective/kl": 38.59171676635742, "objective/non_score_reward": -1.9295859336853027, "objective/rlhf_reward": -6.16208431026037, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.626259803771973, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.734375, "step": 640, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996435284614563 }, { "episode": 10272, "epoch": 0.061545098321170506, "loss/policy_avg": -0.11524446308612823, "lr": 9.590362985685071e-06, "objective/entropy": -123.53447723388672, "objective/kl": 26.7266845703125, "objective/non_score_reward": -1.336334228515625, "objective/rlhf_reward": -3.222630920187507, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.8472533226013184, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46484375, "step": 641, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.011606216430664 }, { "episode": 10288, "epoch": 0.061640962960300055, "loss/policy_avg": 0.4013972282409668, "lr": 9.589723926380368e-06, "objective/entropy": -128.90103149414062, "objective/kl": 31.007064819335938, "objective/non_score_reward": -1.5503532886505127, "objective/rlhf_reward": -4.685641431602177, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.671117782592773, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.556640625, "step": 642, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9970167875289917 }, { "episode": 10304, "epoch": 0.061736827599429604, "loss/policy_avg": 0.7907944321632385, "lr": 9.589084867075665e-06, "objective/entropy": -58.220497131347656, "objective/kl": 41.770606994628906, "objective/non_score_reward": -2.0885305404663086, "objective/rlhf_reward": -6.620788232485452, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.74094581604004, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.466796875, "step": 643, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995949149131775 }, { "episode": 10320, "epoch": 0.06183269223855915, "loss/policy_avg": 0.017528323456645012, "lr": 9.588445807770962e-06, "objective/entropy": -208.79119873046875, "objective/kl": 23.041034698486328, "objective/non_score_reward": -1.1520518064498901, "objective/rlhf_reward": -3.092435383590397, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.83624267578125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.732421875, "step": 644, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0010547637939453 }, { "episode": 10336, "epoch": 0.0619285568776887, "loss/policy_avg": 0.15500307083129883, "lr": 9.587806748466259e-06, "objective/entropy": -124.78570556640625, "objective/kl": 34.243202209472656, "objective/non_score_reward": -1.7121602296829224, "objective/rlhf_reward": -3.92492190444586, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.4558181762695312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5078125, "step": 645, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997308254241943 }, { "episode": 10352, "epoch": 0.06202442151681825, "loss/policy_avg": 0.2161247283220291, "lr": 9.587167689161556e-06, "objective/entropy": -163.63064575195312, "objective/kl": 25.873336791992188, "objective/non_score_reward": -1.293666958808899, "objective/rlhf_reward": -3.7960657263673365, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 8.89102840423584, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5234375, "step": 646, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998837947845459 }, { "episode": 10368, "epoch": 0.0621202861559478, "loss/policy_avg": 0.08966261148452759, "lr": 9.586528629856851e-06, "objective/entropy": -104.2444076538086, "objective/kl": 33.29509735107422, "objective/non_score_reward": -1.664754867553711, "objective/rlhf_reward": -4.925686256090799, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.3677499294281006, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 647, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993959665298462 }, { "episode": 10384, "epoch": 0.06221615079507735, "loss/policy_avg": -0.02724701538681984, "lr": 9.585889570552148e-06, "objective/entropy": -133.99429321289062, "objective/kl": 27.543067932128906, "objective/non_score_reward": -1.3771533966064453, "objective/rlhf_reward": -3.1086136460304257, "objective/scores": 0.6, "policy/approxkl_avg": 7.215035438537598, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.677734375, "step": 648, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986003637313843 }, { "episode": 10400, "epoch": 0.0623120154342069, "loss/policy_avg": -0.23539991676807404, "lr": 9.585250511247445e-06, "objective/entropy": -167.906494140625, "objective/kl": 25.879772186279297, "objective/non_score_reward": -1.293988585472107, "objective/rlhf_reward": -3.571834478441792, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.0954341888427734, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.57421875, "step": 649, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997683763504028 }, { "episode": 10416, "epoch": 0.06240788007333645, "loss/policy_avg": 0.30569222569465637, "lr": 9.584611451942742e-06, "objective/entropy": -226.60678100585938, "objective/kl": 28.675113677978516, "objective/non_score_reward": -1.433755874633789, "objective/rlhf_reward": -3.7876121503877, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 52.77922058105469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6484375, "step": 650, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9980084896087646 }, { "episode": 10432, "epoch": 0.062503744712466, "loss/policy_avg": -0.24214023351669312, "lr": 9.583972392638038e-06, "objective/entropy": -121.17498779296875, "objective/kl": 38.84062957763672, "objective/non_score_reward": -1.9420316219329834, "objective/rlhf_reward": -5.820715139584477, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.8967432975769043, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.623046875, "step": 651, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0004329681396484 }, { "episode": 10448, "epoch": 0.06259960935159554, "loss/policy_avg": -0.3156575858592987, "lr": 9.583333333333335e-06, "objective/entropy": -146.38143920898438, "objective/kl": 32.020687103271484, "objective/non_score_reward": -1.60103440284729, "objective/rlhf_reward": -5.062502017527251, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.199296236038208, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.642578125, "step": 652, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0025553703308105 }, { "episode": 10464, "epoch": 0.0626954739907251, "loss/policy_avg": 0.07271748781204224, "lr": 9.58269427402863e-06, "objective/entropy": -196.48562622070312, "objective/kl": 28.001068115234375, "objective/non_score_reward": -1.4000535011291504, "objective/rlhf_reward": -4.2002141833305355, "objective/scores": 0.35, "policy/approxkl_avg": 24.475753784179688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6875, "step": 653, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0010745525360107 }, { "episode": 10480, "epoch": 0.06279133862985464, "loss/policy_avg": 0.17373695969581604, "lr": 9.582055214723927e-06, "objective/entropy": -275.5335388183594, "objective/kl": 27.79926300048828, "objective/non_score_reward": -1.3899632692337036, "objective/rlhf_reward": -5.5598530769348145, "objective/scores": 0.0, "policy/approxkl_avg": 17.22200584411621, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.615234375, "step": 654, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987027645111084 }, { "episode": 10496, "epoch": 0.0628872032689842, "loss/policy_avg": 0.15186084806919098, "lr": 9.581416155419224e-06, "objective/entropy": -197.2568817138672, "objective/kl": 23.105377197265625, "objective/non_score_reward": -1.1552690267562866, "objective/rlhf_reward": -2.796247239383768, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 35.64599609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7734375, "step": 655, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985601902008057 }, { "episode": 10512, "epoch": 0.06298306790811374, "loss/policy_avg": 0.09821736067533493, "lr": 9.58077709611452e-06, "objective/entropy": -192.20767211914062, "objective/kl": 28.659635543823242, "objective/non_score_reward": -1.4329817295074463, "objective/rlhf_reward": -4.070067649305449, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.6847333908081055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.712890625, "step": 656, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977080821990967 }, { "episode": 10528, "epoch": 0.0630789325472433, "loss/policy_avg": 0.24115119874477386, "lr": 9.580138036809816e-06, "objective/entropy": -171.08619689941406, "objective/kl": 26.453920364379883, "objective/non_score_reward": -1.3226962089538574, "objective/rlhf_reward": -3.8907844781875607, "objective/scores": 0.35, "policy/approxkl_avg": 11.276920318603516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6875, "step": 657, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999925136566162 }, { "episode": 10544, "epoch": 0.06317479718637284, "loss/policy_avg": -0.04878993332386017, "lr": 9.579498977505113e-06, "objective/entropy": -95.69158172607422, "objective/kl": 26.445575714111328, "objective/non_score_reward": -1.3222787380218506, "objective/rlhf_reward": -3.94747917941156, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.285589218139648, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.625, "step": 658, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0013298988342285 }, { "episode": 10560, "epoch": 0.0632706618255024, "loss/policy_avg": -0.10105658322572708, "lr": 9.57885991820041e-06, "objective/entropy": -209.01065063476562, "objective/kl": 27.234224319458008, "objective/non_score_reward": -1.3617112636566162, "objective/rlhf_reward": -4.046844816207885, "objective/scores": 0.35, "policy/approxkl_avg": 2.436962366104126, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6328125, "step": 659, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001246690750122 }, { "episode": 10576, "epoch": 0.06336652646463194, "loss/policy_avg": -0.3218346834182739, "lr": 9.578220858895705e-06, "objective/entropy": -3.9748001098632812, "objective/kl": 18.186880111694336, "objective/non_score_reward": -0.9093440771102905, "objective/rlhf_reward": -1.5146698824324945, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 28.07345962524414, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8359375, "step": 660, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002238750457764 }, { "episode": 10592, "epoch": 0.06346239110376149, "loss/policy_avg": -0.19762462377548218, "lr": 9.577581799591002e-06, "objective/entropy": -204.72760009765625, "objective/kl": 18.785112380981445, "objective/non_score_reward": -0.9392555356025696, "objective/rlhf_reward": -1.6343160293259955, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.8940598964691162, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.560546875, "step": 661, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0025925636291504 }, { "episode": 10608, "epoch": 0.06355825574289103, "loss/policy_avg": -0.45743584632873535, "lr": 9.576942740286299e-06, "objective/entropy": -134.4844970703125, "objective/kl": 33.7373046875, "objective/non_score_reward": -1.6868653297424316, "objective/rlhf_reward": -5.296863298030242, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.153486967086792, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.521484375, "step": 662, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00048828125 }, { "episode": 10624, "epoch": 0.06365412038202059, "loss/policy_avg": 0.2565079629421234, "lr": 9.576303680981596e-06, "objective/entropy": -180.13528442382812, "objective/kl": 17.24534034729004, "objective/non_score_reward": -0.862267017364502, "objective/rlhf_reward": -2.089818143580837, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.433453559875488, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 663, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994475841522217 }, { "episode": 10640, "epoch": 0.06374998502115013, "loss/policy_avg": 0.17452527582645416, "lr": 9.575664621676893e-06, "objective/entropy": -64.2728271484375, "objective/kl": 21.405649185180664, "objective/non_score_reward": -1.0702824592590332, "objective/rlhf_reward": -2.9218800303682517, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.6351606845855713, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.71484375, "step": 664, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0012965202331543 }, { "episode": 10656, "epoch": 0.06384584966027969, "loss/policy_avg": 0.6966801881790161, "lr": 9.57502556237219e-06, "objective/entropy": -251.04238891601562, "objective/kl": 27.693851470947266, "objective/non_score_reward": -1.384692668914795, "objective/rlhf_reward": -3.934650454584675, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.390886306762695, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5859375, "step": 665, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001401662826538 }, { "episode": 10672, "epoch": 0.06394171429940923, "loss/policy_avg": 0.16458481550216675, "lr": 9.574386503067485e-06, "objective/entropy": -219.99136352539062, "objective/kl": 13.308931350708008, "objective/non_score_reward": -0.6654465198516846, "objective/rlhf_reward": -0.7143749100732166, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.77976131439209, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 666, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000656843185425 }, { "episode": 10688, "epoch": 0.06403757893853879, "loss/policy_avg": -0.009436726570129395, "lr": 9.573747443762782e-06, "objective/entropy": -162.25047302246094, "objective/kl": 23.977962493896484, "objective/non_score_reward": -1.1988980770111084, "objective/rlhf_reward": -2.8481810791062667, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 22.450942993164062, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.763671875, "step": 667, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0019123554229736 }, { "episode": 10704, "epoch": 0.06413344357766833, "loss/policy_avg": 0.4135128855705261, "lr": 9.573108384458079e-06, "objective/entropy": -63.0797119140625, "objective/kl": 41.37904739379883, "objective/non_score_reward": -2.0689523220062256, "objective/rlhf_reward": -6.542475895086923, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 88.98745727539062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.779296875, "step": 668, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999923706054688 }, { "episode": 10720, "epoch": 0.06422930821679788, "loss/policy_avg": 0.6821532845497131, "lr": 9.572469325153375e-06, "objective/entropy": -196.7287139892578, "objective/kl": 30.88260269165039, "objective/non_score_reward": -1.5441300868988037, "objective/rlhf_reward": -4.660748505386051, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 23.963293075561523, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.78515625, "step": 669, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989004135131836 }, { "episode": 10736, "epoch": 0.06432517285592743, "loss/policy_avg": 0.3629915118217468, "lr": 9.571830265848672e-06, "objective/entropy": -205.541259765625, "objective/kl": 24.442432403564453, "objective/non_score_reward": -1.2221217155456543, "objective/rlhf_reward": -3.155153171221415, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 15.010305404663086, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.63671875, "step": 670, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991261959075928 }, { "episode": 10752, "epoch": 0.06442103749505698, "loss/policy_avg": 0.3024546504020691, "lr": 9.571191206543968e-06, "objective/entropy": -184.0182647705078, "objective/kl": 28.46197509765625, "objective/non_score_reward": -1.4230988025665283, "objective/rlhf_reward": -3.744983862118657, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.1509013175964355, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 671, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998319149017334 }, { "episode": 10768, "epoch": 0.06451690213418652, "loss/policy_avg": -0.12359270453453064, "lr": 9.570552147239264e-06, "objective/entropy": -107.1251220703125, "objective/kl": 24.85216522216797, "objective/non_score_reward": -1.2426085472106934, "objective/rlhf_reward": -3.611183964942379, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.815180540084839, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.62890625, "step": 672, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002232074737549 }, { "episode": 10784, "epoch": 0.06461276677331608, "loss/policy_avg": 0.3783743977546692, "lr": 9.569913087934561e-06, "objective/entropy": -155.0634765625, "objective/kl": 33.26643371582031, "objective/non_score_reward": -1.663321852684021, "objective/rlhf_reward": -5.294037544463558, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.487679958343506, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.751953125, "step": 673, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9973864555358887 }, { "episode": 10800, "epoch": 0.06470863141244562, "loss/policy_avg": 0.12491178512573242, "lr": 9.569274028629858e-06, "objective/entropy": -202.8880157470703, "objective/kl": 23.53227996826172, "objective/non_score_reward": -1.1766140460968018, "objective/rlhf_reward": -2.9731229106585184, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 5.709697246551514, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.576171875, "step": 674, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979543685913086 }, { "episode": 10816, "epoch": 0.06480449605157518, "loss/policy_avg": -0.01751142367720604, "lr": 9.568634969325155e-06, "objective/entropy": -217.27896118164062, "objective/kl": 27.020957946777344, "objective/non_score_reward": -1.3510478734970093, "objective/rlhf_reward": -3.4567805034684493, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 0.6378078460693359, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.63671875, "step": 675, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0012476444244385 }, { "episode": 10832, "epoch": 0.06490036069070472, "loss/policy_avg": 0.28126630187034607, "lr": 9.567995910020452e-06, "objective/entropy": -230.15963745117188, "objective/kl": 24.95879364013672, "objective/non_score_reward": -1.2479398250579834, "objective/rlhf_reward": -3.329899912298308, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 10.301782608032227, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.744140625, "step": 676, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993771314620972 }, { "episode": 10848, "epoch": 0.06499622532983428, "loss/policy_avg": 0.12287623435258865, "lr": 9.567356850715747e-06, "objective/entropy": -263.37542724609375, "objective/kl": 23.937744140625, "objective/non_score_reward": -1.1968872547149658, "objective/rlhf_reward": -0.3875493764877316, "objective/scores": 1.1, "policy/approxkl_avg": 45.05952453613281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.703125, "step": 677, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995876550674438 }, { "episode": 10864, "epoch": 0.06509208996896382, "loss/policy_avg": 0.6470179557800293, "lr": 9.566717791411044e-06, "objective/entropy": -65.45881652832031, "objective/kl": 23.807559967041016, "objective/non_score_reward": -1.190378189086914, "objective/rlhf_reward": -3.419877028375297, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 10.65350341796875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.744140625, "step": 678, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999696969985962 }, { "episode": 10880, "epoch": 0.06518795460809337, "loss/policy_avg": 0.2790781855583191, "lr": 9.56607873210634e-06, "objective/entropy": -161.4605712890625, "objective/kl": 41.620460510253906, "objective/non_score_reward": -2.0810232162475586, "objective/rlhf_reward": -3.924092388153076, "objective/scores": 1.1, "policy/approxkl_avg": 5.482306480407715, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.57421875, "step": 679, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986392259597778 }, { "episode": 10896, "epoch": 0.06528381924722292, "loss/policy_avg": 0.042992569506168365, "lr": 9.565439672801636e-06, "objective/entropy": -162.92010498046875, "objective/kl": 26.902143478393555, "objective/non_score_reward": -1.3451071977615356, "objective/rlhf_reward": -4.001826503363949, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.27599573135376, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.654296875, "step": 680, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998090147972107 }, { "episode": 10912, "epoch": 0.06537968388635247, "loss/policy_avg": 0.20157073438167572, "lr": 9.564800613496933e-06, "objective/entropy": -265.3901672363281, "objective/kl": 29.956632614135742, "objective/non_score_reward": -1.4978315830230713, "objective/rlhf_reward": -3.868620397821937, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 68.22042846679688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.609375, "step": 681, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982101917266846 }, { "episode": 10928, "epoch": 0.06547554852548201, "loss/policy_avg": 1.519484281539917, "lr": 9.56416155419223e-06, "objective/entropy": -127.62720489501953, "objective/kl": 23.382505416870117, "objective/non_score_reward": -1.1691253185272217, "objective/rlhf_reward": -2.2765009164810177, "objective/scores": 0.6, "policy/approxkl_avg": 17.878856658935547, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.419921875, "step": 682, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984099864959717 }, { "episode": 10944, "epoch": 0.06557141316461157, "loss/policy_avg": 0.3158057928085327, "lr": 9.563522494887527e-06, "objective/entropy": -190.45260620117188, "objective/kl": 25.518230438232422, "objective/non_score_reward": -1.275911569595337, "objective/rlhf_reward": -3.622693660672068, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 34.12330627441406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.63671875, "step": 683, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000800371170044 }, { "episode": 10960, "epoch": 0.06566727780374111, "loss/policy_avg": 1.1294161081314087, "lr": 9.562883435582822e-06, "objective/entropy": -107.20721435546875, "objective/kl": 32.379913330078125, "objective/non_score_reward": -1.6189957857131958, "objective/rlhf_reward": -5.13434737017694, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 7.272080421447754, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5234375, "step": 684, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998885154724121 }, { "episode": 10976, "epoch": 0.06576314244287067, "loss/policy_avg": 0.44281357526779175, "lr": 9.562244376278119e-06, "objective/entropy": -128.0640869140625, "objective/kl": 20.03044891357422, "objective/non_score_reward": -1.0015225410461426, "objective/rlhf_reward": -1.082371120096418, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.73418140411377, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.634765625, "step": 685, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999821424484253 }, { "episode": 10992, "epoch": 0.06585900708200021, "loss/policy_avg": 0.2683737576007843, "lr": 9.561605316973416e-06, "objective/entropy": -258.8201904296875, "objective/kl": 27.295347213745117, "objective/non_score_reward": -1.3647674322128296, "objective/rlhf_reward": -2.535350595356199, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.86362886428833, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71875, "step": 686, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981340169906616 }, { "episode": 11008, "epoch": 0.06595487172112977, "loss/policy_avg": -0.14624132215976715, "lr": 9.560966257668713e-06, "objective/entropy": -96.99462890625, "objective/kl": 30.466350555419922, "objective/non_score_reward": -1.523317575454712, "objective/rlhf_reward": -4.57749816158646, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.779112815856934, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.494140625, "step": 687, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981212615966797 }, { "episode": 11024, "epoch": 0.06605073636025931, "loss/policy_avg": 0.12842759490013123, "lr": 9.56032719836401e-06, "objective/entropy": -166.20689392089844, "objective/kl": 26.250516891479492, "objective/non_score_reward": -1.312525749206543, "objective/rlhf_reward": -2.8501029968261715, "objective/scores": 0.6, "policy/approxkl_avg": 7.160890102386475, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5703125, "step": 688, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990025758743286 }, { "episode": 11040, "epoch": 0.06614660099938886, "loss/policy_avg": 0.2923339009284973, "lr": 9.559688139059306e-06, "objective/entropy": -236.72100830078125, "objective/kl": 33.81795883178711, "objective/non_score_reward": -1.6908979415893555, "objective/rlhf_reward": -5.4219562321001575, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 16.3193359375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 689, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.99735426902771 }, { "episode": 11056, "epoch": 0.0662424656385184, "loss/policy_avg": -0.10266150534152985, "lr": 9.559049079754601e-06, "objective/entropy": -85.62126159667969, "objective/kl": 31.331233978271484, "objective/non_score_reward": -1.5665616989135742, "objective/rlhf_reward": -4.143540324942146, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 6.518294811248779, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.794921875, "step": 690, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.006378650665283 }, { "episode": 11072, "epoch": 0.06633833027764796, "loss/policy_avg": 0.17208513617515564, "lr": 9.558410020449898e-06, "objective/entropy": -175.00662231445312, "objective/kl": 33.992698669433594, "objective/non_score_reward": -1.6996350288391113, "objective/rlhf_reward": -5.4392902490839194, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 32.03794860839844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.716796875, "step": 691, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989235401153564 }, { "episode": 11088, "epoch": 0.06643419491677752, "loss/policy_avg": 0.01335047371685505, "lr": 9.557770961145195e-06, "objective/entropy": -248.65049743652344, "objective/kl": 22.41885757446289, "objective/non_score_reward": -1.1209429502487183, "objective/rlhf_reward": -2.536360512452062, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.7352328300476074, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.673828125, "step": 692, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0011818408966064 }, { "episode": 11104, "epoch": 0.06653005955590706, "loss/policy_avg": 0.14417897164821625, "lr": 9.557131901840492e-06, "objective/entropy": -218.454345703125, "objective/kl": 15.86509895324707, "objective/non_score_reward": -0.7932549715042114, "objective/rlhf_reward": 1.226980143785477, "objective/scores": 1.1, "policy/approxkl_avg": 1.0328912734985352, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.640625, "step": 693, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0003151893615723 }, { "episode": 11120, "epoch": 0.06662592419503661, "loss/policy_avg": 0.09597369283437729, "lr": 9.556492842535789e-06, "objective/entropy": -175.68487548828125, "objective/kl": 32.48929977416992, "objective/non_score_reward": -1.624464988708496, "objective/rlhf_reward": -2.0978601336479183, "objective/scores": 1.1, "policy/approxkl_avg": 3.689056396484375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.705078125, "step": 694, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998678207397461 }, { "episode": 11136, "epoch": 0.06672178883416616, "loss/policy_avg": -0.004386359825730324, "lr": 9.555853783231084e-06, "objective/entropy": 122.54474639892578, "objective/kl": 42.134315490722656, "objective/non_score_reward": -2.106715679168701, "objective/rlhf_reward": -6.822743091646748, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.307683944702148, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7421875, "step": 695, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999419450759888 }, { "episode": 11152, "epoch": 0.06681765347329571, "loss/policy_avg": 0.3615373373031616, "lr": 9.555214723926381e-06, "objective/entropy": -260.84075927734375, "objective/kl": 35.725467681884766, "objective/non_score_reward": -1.7862732410430908, "objective/rlhf_reward": -5.664140108044505, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 45.438873291015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.685546875, "step": 696, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.996116280555725 }, { "episode": 11168, "epoch": 0.06691351811242525, "loss/policy_avg": 0.24602335691452026, "lr": 9.554575664621678e-06, "objective/entropy": -71.92741394042969, "objective/kl": 30.083784103393555, "objective/non_score_reward": -1.5041892528533936, "objective/rlhf_reward": -4.6575073835596275, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 5.438946723937988, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4765625, "step": 697, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998178482055664 }, { "episode": 11184, "epoch": 0.06700938275155481, "loss/policy_avg": 0.034039177000522614, "lr": 9.553936605316975e-06, "objective/entropy": -198.67774963378906, "objective/kl": 23.375925064086914, "objective/non_score_reward": -1.1687963008880615, "objective/rlhf_reward": -1.7514660700571265, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.5530495643615723, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.53125, "step": 698, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00034236907959 }, { "episode": 11200, "epoch": 0.06710524739068435, "loss/policy_avg": 0.5306535959243774, "lr": 9.553297546012272e-06, "objective/entropy": -143.43771362304688, "objective/kl": 35.411888122558594, "objective/non_score_reward": -1.7705943584442139, "objective/rlhf_reward": -5.63177965125595, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.416120529174805, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.66796875, "step": 699, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994237422943115 }, { "episode": 11216, "epoch": 0.06720111202981391, "loss/policy_avg": 0.2092888504266739, "lr": 9.552658486707569e-06, "objective/entropy": -169.036376953125, "objective/kl": 30.64543914794922, "objective/non_score_reward": -1.5322721004486084, "objective/rlhf_reward": -1.7290880441665646, "objective/scores": 1.1, "policy/approxkl_avg": 132.6121063232422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.564453125, "step": 700, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992992877960205 }, { "episode": 11232, "epoch": 0.06729697666894345, "loss/policy_avg": 0.2553282380104065, "lr": 9.552019427402864e-06, "objective/entropy": -145.8370361328125, "objective/kl": 31.58509063720703, "objective/non_score_reward": -1.5792546272277832, "objective/rlhf_reward": -4.760759084430292, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 23.342622756958008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.712890625, "step": 701, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0007388591766357 }, { "episode": 11248, "epoch": 0.067392841308073, "loss/policy_avg": 0.1272473782300949, "lr": 9.55138036809816e-06, "objective/entropy": -283.0919494628906, "objective/kl": 18.825233459472656, "objective/non_score_reward": -0.9412617683410645, "objective/rlhf_reward": -2.4057970878824424, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.5947492122650146, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70703125, "step": 702, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999638557434082 }, { "episode": 11264, "epoch": 0.06748870594720255, "loss/policy_avg": 0.2034430205821991, "lr": 9.550741308793456e-06, "objective/entropy": -274.40478515625, "objective/kl": 20.724695205688477, "objective/non_score_reward": -1.0362348556518555, "objective/rlhf_reward": -1.221220110298368, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.738941192626953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.61328125, "step": 703, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997761607170105 }, { "episode": 11280, "epoch": 0.0675845705863321, "loss/policy_avg": 0.7114033699035645, "lr": 9.550102249488753e-06, "objective/entropy": -135.6627960205078, "objective/kl": 27.718311309814453, "objective/non_score_reward": -1.3859155178070068, "objective/rlhf_reward": -3.5962508422898605, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 32.94233703613281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 704, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985225200653076 }, { "episode": 11296, "epoch": 0.06768043522546165, "loss/policy_avg": -0.08856553584337234, "lr": 9.54946319018405e-06, "objective/entropy": -172.419921875, "objective/kl": 31.078826904296875, "objective/non_score_reward": -1.553941249847412, "objective/rlhf_reward": -4.765167097659454, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 27.00151824951172, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.76171875, "step": 705, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003095865249634 }, { "episode": 11312, "epoch": 0.0677762998645912, "loss/policy_avg": -0.1016867533326149, "lr": 9.548824130879346e-06, "objective/entropy": -186.52476501464844, "objective/kl": 30.371601104736328, "objective/non_score_reward": -1.5185801982879639, "objective/rlhf_reward": -4.593368175442576, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 7.805020332336426, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.68359375, "step": 706, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0018138885498047 }, { "episode": 11328, "epoch": 0.06787216450372074, "loss/policy_avg": 0.3950710892677307, "lr": 9.548185071574643e-06, "objective/entropy": -169.30099487304688, "objective/kl": 26.604206085205078, "objective/non_score_reward": -1.3302103281021118, "objective/rlhf_reward": -3.9422390843308985, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.9309802055358887, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.650390625, "step": 707, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0003857612609863 }, { "episode": 11344, "epoch": 0.0679680291428503, "loss/policy_avg": 0.15957045555114746, "lr": 9.547546012269938e-06, "objective/entropy": -152.48211669921875, "objective/kl": 28.93355941772461, "objective/non_score_reward": -1.4466780424118042, "objective/rlhf_reward": -4.124852543295012, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 30.355663299560547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.728515625, "step": 708, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9971683025360107 }, { "episode": 11360, "epoch": 0.06806389378197984, "loss/policy_avg": 0.1635814905166626, "lr": 9.546906952965235e-06, "objective/entropy": -225.05284118652344, "objective/kl": 32.07009506225586, "objective/non_score_reward": -1.6035047769546509, "objective/rlhf_reward": -5.088506314784212, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 25.63396453857422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.708984375, "step": 709, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9975805282592773 }, { "episode": 11376, "epoch": 0.0681597584211094, "loss/policy_avg": 0.22918304800987244, "lr": 9.546267893660532e-06, "objective/entropy": -245.11099243164062, "objective/kl": 31.21074867248535, "objective/non_score_reward": -1.560537576675415, "objective/rlhf_reward": -4.5802904419308765, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.6522216796875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.658203125, "step": 710, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971773624420166 }, { "episode": 11392, "epoch": 0.06825562306023894, "loss/policy_avg": -0.15267148613929749, "lr": 9.545628834355829e-06, "objective/entropy": -26.006134033203125, "objective/kl": 25.76430320739746, "objective/non_score_reward": -1.288215160369873, "objective/rlhf_reward": -3.2054496509599044, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.9515511989593506, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.75, "step": 711, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0021400451660156 }, { "episode": 11408, "epoch": 0.0683514876993685, "loss/policy_avg": 0.03201477974653244, "lr": 9.544989775051126e-06, "objective/entropy": -229.9574737548828, "objective/kl": 31.691633224487305, "objective/non_score_reward": -1.5845816135406494, "objective/rlhf_reward": -4.887728492827758, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 81.25225830078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.74609375, "step": 712, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0012998580932617 }, { "episode": 11424, "epoch": 0.06844735233849804, "loss/policy_avg": 0.5598920583724976, "lr": 9.544350715746423e-06, "objective/entropy": -198.39407348632812, "objective/kl": 22.02547264099121, "objective/non_score_reward": -1.1012736558914185, "objective/rlhf_reward": -3.045844846700115, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.494403839111328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6640625, "step": 713, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001235008239746 }, { "episode": 11440, "epoch": 0.0685432169776276, "loss/policy_avg": 0.14270013570785522, "lr": 9.543711656441718e-06, "objective/entropy": -281.67730712890625, "objective/kl": 30.167518615722656, "objective/non_score_reward": -1.5083760023117065, "objective/rlhf_reward": -4.517732465060886, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 42.272212982177734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6171875, "step": 714, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981521368026733 }, { "episode": 11456, "epoch": 0.06863908161675714, "loss/policy_avg": 0.23854002356529236, "lr": 9.543072597137015e-06, "objective/entropy": -205.70501708984375, "objective/kl": 26.037616729736328, "objective/non_score_reward": -1.3018807172775269, "objective/rlhf_reward": -3.603402886454182, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 21.1671085357666, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736328125, "step": 715, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999593734741211 }, { "episode": 11472, "epoch": 0.06873494625588669, "loss/policy_avg": 0.25810641050338745, "lr": 9.542433537832312e-06, "objective/entropy": -202.4583740234375, "objective/kl": 26.777297973632812, "objective/non_score_reward": -1.338865041732788, "objective/rlhf_reward": -3.7513400054612926, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.448478698730469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7890625, "step": 716, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999415636062622 }, { "episode": 11488, "epoch": 0.06883081089501623, "loss/policy_avg": 0.16866181790828705, "lr": 9.541794478527609e-06, "objective/entropy": -174.37855529785156, "objective/kl": 34.941444396972656, "objective/non_score_reward": -1.7470722198486328, "objective/rlhf_reward": -5.43202957412298, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 0.9149700403213501, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69140625, "step": 717, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000814437866211 }, { "episode": 11504, "epoch": 0.06892667553414579, "loss/policy_avg": 0.20718123018741608, "lr": 9.541155419222906e-06, "objective/entropy": -75.93595123291016, "objective/kl": 37.52787780761719, "objective/non_score_reward": -1.8763937950134277, "objective/rlhf_reward": -6.024622860367655, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.859286308288574, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 718, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990894794464111 }, { "episode": 11520, "epoch": 0.06902254017327533, "loss/policy_avg": -0.14078834652900696, "lr": 9.5405163599182e-06, "objective/entropy": -111.06301879882812, "objective/kl": 37.833980560302734, "objective/non_score_reward": -1.8916990756988525, "objective/rlhf_reward": -5.44408971287397, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.0138969421386719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.646484375, "step": 719, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016212463378906 }, { "episode": 11536, "epoch": 0.06911840481240489, "loss/policy_avg": -0.02326921373605728, "lr": 9.539877300613498e-06, "objective/entropy": -7.474525451660156, "objective/kl": 37.21611785888672, "objective/non_score_reward": -1.860805869102478, "objective/rlhf_reward": -7.443223357200623, "objective/scores": 0.0, "policy/approxkl_avg": 0.989769458770752, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5546875, "step": 720, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0015361309051514 }, { "episode": 11552, "epoch": 0.06921426945153443, "loss/policy_avg": 0.9960123896598816, "lr": 9.539238241308795e-06, "objective/entropy": -102.21640014648438, "objective/kl": 29.624881744384766, "objective/non_score_reward": -1.4812440872192383, "objective/rlhf_reward": -3.9775650007294967, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.5700416564941406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.568359375, "step": 721, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000302314758301 }, { "episode": 11568, "epoch": 0.06931013409066399, "loss/policy_avg": -0.022494332864880562, "lr": 9.538599182004091e-06, "objective/entropy": -97.00556182861328, "objective/kl": 34.23220443725586, "objective/non_score_reward": -1.7116100788116455, "objective/rlhf_reward": -5.520927820235414, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.0028605461120605, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.625, "step": 722, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014429092407227 }, { "episode": 11584, "epoch": 0.06940599872979353, "loss/policy_avg": 0.2970792055130005, "lr": 9.537960122699387e-06, "objective/entropy": -218.43130493164062, "objective/kl": 23.677339553833008, "objective/non_score_reward": -1.1838669776916504, "objective/rlhf_reward": -0.335467970371246, "objective/scores": 1.1, "policy/approxkl_avg": 35.85502624511719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.744140625, "step": 723, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9957175254821777 }, { "episode": 11600, "epoch": 0.06950186336892308, "loss/policy_avg": 0.09062906354665756, "lr": 9.537321063394683e-06, "objective/entropy": -145.62179565429688, "objective/kl": 19.510597229003906, "objective/non_score_reward": -0.9755299091339111, "objective/rlhf_reward": -2.560484102278381, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.657525539398193, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.671875, "step": 724, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002211809158325 }, { "episode": 11616, "epoch": 0.06959772800805263, "loss/policy_avg": 0.5650205612182617, "lr": 9.53668200408998e-06, "objective/entropy": -189.58197021484375, "objective/kl": 22.43151092529297, "objective/non_score_reward": -1.1215755939483643, "objective/rlhf_reward": -3.1446669011408384, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 16.189781188964844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69140625, "step": 725, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978678226470947 }, { "episode": 11632, "epoch": 0.06969359264718218, "loss/policy_avg": 0.10538655519485474, "lr": 9.536042944785277e-06, "objective/entropy": -262.17254638671875, "objective/kl": 21.21435546875, "objective/non_score_reward": -1.0607177019119263, "objective/rlhf_reward": -2.1201648137727123, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 5.554556846618652, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7578125, "step": 726, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994568824768066 }, { "episode": 11648, "epoch": 0.06978945728631172, "loss/policy_avg": 0.08264347910881042, "lr": 9.535403885480572e-06, "objective/entropy": -144.35389709472656, "objective/kl": 23.849288940429688, "objective/non_score_reward": -1.1924644708633423, "objective/rlhf_reward": -3.2889052657440896, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 0.2577582895755768, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.60546875, "step": 727, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009419918060303 }, { "episode": 11664, "epoch": 0.06988532192544128, "loss/policy_avg": -0.11442309617996216, "lr": 9.53476482617587e-06, "objective/entropy": -161.91555786132812, "objective/kl": 29.32978057861328, "objective/non_score_reward": -1.4664889574050903, "objective/rlhf_reward": -4.132622496287028, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.162350654602051, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.58203125, "step": 728, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0256590843200684 }, { "episode": 11680, "epoch": 0.06998118656457082, "loss/policy_avg": 0.15979725122451782, "lr": 9.534125766871166e-06, "objective/entropy": -46.392860412597656, "objective/kl": 34.71672058105469, "objective/non_score_reward": -1.7358360290527344, "objective/rlhf_reward": -5.601708403139739, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 26.208736419677734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.837890625, "step": 729, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9947504997253418 }, { "episode": 11696, "epoch": 0.07007705120370038, "loss/policy_avg": 0.01945001818239689, "lr": 9.533486707566463e-06, "objective/entropy": -199.32308959960938, "objective/kl": 20.052722930908203, "objective/non_score_reward": -1.002636194229126, "objective/rlhf_reward": -2.586712677677242, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.049467086791992, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.623046875, "step": 730, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999655485153198 }, { "episode": 11712, "epoch": 0.07017291584282992, "loss/policy_avg": 0.22911685705184937, "lr": 9.53284764826176e-06, "objective/entropy": -199.43820190429688, "objective/kl": 29.375852584838867, "objective/non_score_reward": -1.4687926769256592, "objective/rlhf_reward": -3.4751707077026364, "objective/scores": 0.6, "policy/approxkl_avg": 1.4132235050201416, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.64453125, "step": 731, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989359378814697 }, { "episode": 11728, "epoch": 0.07026878048195948, "loss/policy_avg": 0.045667171478271484, "lr": 9.532208588957055e-06, "objective/entropy": -156.77005004882812, "objective/kl": 28.574951171875, "objective/non_score_reward": -1.4287474155426025, "objective/rlhf_reward": -4.110870037142353, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 11.299884796142578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.587890625, "step": 732, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989423751831055 }, { "episode": 11744, "epoch": 0.07036464512108902, "loss/policy_avg": -0.07621235400438309, "lr": 9.531569529652352e-06, "objective/entropy": -211.5927734375, "objective/kl": 25.139881134033203, "objective/non_score_reward": -1.2569940090179443, "objective/rlhf_reward": -3.2031475856629124, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.0796079635620117, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.548828125, "step": 733, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00140118598938 }, { "episode": 11760, "epoch": 0.07046050976021857, "loss/policy_avg": 0.3665542006492615, "lr": 9.530930470347649e-06, "objective/entropy": -136.42066955566406, "objective/kl": 28.39642333984375, "objective/non_score_reward": -1.4198211431503296, "objective/rlhf_reward": -5.679284453392029, "objective/scores": 0.0, "policy/approxkl_avg": 2.8006393909454346, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.673828125, "step": 734, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988253116607666 }, { "episode": 11776, "epoch": 0.07055637439934812, "loss/policy_avg": -0.16624964773654938, "lr": 9.530291411042946e-06, "objective/entropy": -172.16896057128906, "objective/kl": 32.62467956542969, "objective/non_score_reward": -1.6312339305877686, "objective/rlhf_reward": -5.183300068884521, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.176142692565918, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587890625, "step": 735, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0014700889587402 }, { "episode": 11792, "epoch": 0.07065223903847767, "loss/policy_avg": -0.01751716434955597, "lr": 9.529652351738243e-06, "objective/entropy": -244.469970703125, "objective/kl": 21.34896469116211, "objective/non_score_reward": -1.0674481391906738, "objective/rlhf_reward": -1.346073900104734, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.2310829162597656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.73828125, "step": 736, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0015974044799805 }, { "episode": 11808, "epoch": 0.07074810367760721, "loss/policy_avg": -0.13727766275405884, "lr": 9.52901329243354e-06, "objective/entropy": -152.7752227783203, "objective/kl": 30.841548919677734, "objective/non_score_reward": -1.5420774221420288, "objective/rlhf_reward": -1.7683096885681149, "objective/scores": 1.1, "policy/approxkl_avg": 2.1432337760925293, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.732421875, "step": 737, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000338315963745 }, { "episode": 11824, "epoch": 0.07084396831673677, "loss/policy_avg": 0.24724145233631134, "lr": 9.528374233128835e-06, "objective/entropy": -249.35003662109375, "objective/kl": 41.97819519042969, "objective/non_score_reward": -2.098909854888916, "objective/rlhf_reward": -6.945041160197601, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 14.357757568359375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7421875, "step": 738, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984996318817139 }, { "episode": 11840, "epoch": 0.07093983295586631, "loss/policy_avg": -0.1166142150759697, "lr": 9.527735173824132e-06, "objective/entropy": 16.65149688720703, "objective/kl": 28.71587371826172, "objective/non_score_reward": -1.4357936382293701, "objective/rlhf_reward": -4.401539257078796, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.7607579231262207, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.64453125, "step": 739, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990723133087158 }, { "episode": 11856, "epoch": 0.07103569759499587, "loss/policy_avg": 0.035362888127565384, "lr": 9.527096114519428e-06, "objective/entropy": -227.2210235595703, "objective/kl": 27.349641799926758, "objective/non_score_reward": -1.36748206615448, "objective/rlhf_reward": -3.865808401171284, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 9.06348705291748, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6640625, "step": 740, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996118545532227 }, { "episode": 11872, "epoch": 0.07113156223412541, "loss/policy_avg": 0.31989267468452454, "lr": 9.526457055214725e-06, "objective/entropy": -213.7845458984375, "objective/kl": 34.27381896972656, "objective/non_score_reward": -1.713691234588623, "objective/rlhf_reward": -4.732058527246986, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 26.892040252685547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.67578125, "step": 741, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0020368099212646 }, { "episode": 11888, "epoch": 0.07122742687325496, "loss/policy_avg": 0.18080441653728485, "lr": 9.525817995910022e-06, "objective/entropy": -164.34909057617188, "objective/kl": 29.15081024169922, "objective/non_score_reward": -1.457540512084961, "objective/rlhf_reward": -4.379564206214294, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 19.9893798828125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 742, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979908466339111 }, { "episode": 11904, "epoch": 0.0713232915123845, "loss/policy_avg": 0.06947439908981323, "lr": 9.525178936605317e-06, "objective/entropy": -35.78013610839844, "objective/kl": 30.88395118713379, "objective/non_score_reward": -1.5441975593566895, "objective/rlhf_reward": -4.620531051364496, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.903773307800293, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.658203125, "step": 743, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000925064086914 }, { "episode": 11920, "epoch": 0.07141915615151406, "loss/policy_avg": 0.4868197441101074, "lr": 9.524539877300614e-06, "objective/entropy": -185.67857360839844, "objective/kl": 30.794139862060547, "objective/non_score_reward": -1.5397069454193115, "objective/rlhf_reward": -4.833315048247499, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 23.752399444580078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5703125, "step": 744, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9970831871032715 }, { "episode": 11936, "epoch": 0.0715150207906436, "loss/policy_avg": 0.4937871992588043, "lr": 9.52390081799591e-06, "objective/entropy": -196.15248107910156, "objective/kl": 32.130393981933594, "objective/non_score_reward": -1.6065199375152588, "objective/rlhf_reward": -5.084443858175903, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.993836402893066, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.509765625, "step": 745, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994457960128784 }, { "episode": 11952, "epoch": 0.07161088542977316, "loss/policy_avg": 0.10673123598098755, "lr": 9.523261758691206e-06, "objective/entropy": -74.68463134765625, "objective/kl": 34.281944274902344, "objective/non_score_reward": -1.7140971422195435, "objective/rlhf_reward": -3.9326697334062786, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.657389640808105, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4892578125, "step": 746, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998197078704834 }, { "episode": 11968, "epoch": 0.0717067500689027, "loss/policy_avg": 0.08303539454936981, "lr": 9.522622699386503e-06, "objective/entropy": -234.022705078125, "objective/kl": 26.956684112548828, "objective/non_score_reward": -1.3478342294692993, "objective/rlhf_reward": -3.26863074518827, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 9.614282608032227, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.77734375, "step": 747, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9977872371673584 }, { "episode": 11984, "epoch": 0.07180261470803226, "loss/policy_avg": 0.006275704130530357, "lr": 9.5219836400818e-06, "objective/entropy": -179.78111267089844, "objective/kl": 24.191059112548828, "objective/non_score_reward": -1.2095528841018677, "objective/rlhf_reward": -3.4789615509256553, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.5060572624206543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48828125, "step": 748, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001844882965088 }, { "episode": 12000, "epoch": 0.07189847934716181, "loss/policy_avg": 0.05262988060712814, "lr": 9.521344580777097e-06, "objective/entropy": -61.52648162841797, "objective/kl": 24.345882415771484, "objective/non_score_reward": -1.2172942161560059, "objective/rlhf_reward": -2.7464705727258067, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 26.343456268310547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.734375, "step": 749, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996674060821533 }, { "episode": 12016, "epoch": 0.07199434398629136, "loss/policy_avg": 0.1489763706922531, "lr": 9.520705521472394e-06, "objective/entropy": -179.14523315429688, "objective/kl": 25.692440032958984, "objective/non_score_reward": -1.284622073173523, "objective/rlhf_reward": -3.19107700415128, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.4589556455612183, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.517578125, "step": 750, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989516735076904 }, { "episode": 12032, "epoch": 0.07209020862542091, "loss/policy_avg": 0.06708867847919464, "lr": 9.520066462167689e-06, "objective/entropy": -56.47541427612305, "objective/kl": 42.95630645751953, "objective/non_score_reward": -2.147815227508545, "objective/rlhf_reward": -6.7664322808113795, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 6.856327056884766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.650390625, "step": 751, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9969167709350586 }, { "episode": 12048, "epoch": 0.07218607326455045, "loss/policy_avg": 0.3973958194255829, "lr": 9.519427402862986e-06, "objective/entropy": -244.11431884765625, "objective/kl": 25.62933921813965, "objective/non_score_reward": -1.2814669609069824, "objective/rlhf_reward": -3.301039035591196, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 48.01885223388672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.671875, "step": 752, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983410835266113 }, { "episode": 12064, "epoch": 0.07228193790368001, "loss/policy_avg": 0.016892850399017334, "lr": 9.518788343558283e-06, "objective/entropy": -233.80613708496094, "objective/kl": 33.0050048828125, "objective/non_score_reward": -1.6502504348754883, "objective/rlhf_reward": -4.653590510563786, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 15.416328430175781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.560546875, "step": 753, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999025821685791 }, { "episode": 12080, "epoch": 0.07237780254280955, "loss/policy_avg": 0.10087546706199646, "lr": 9.51814928425358e-06, "objective/entropy": -283.5254211425781, "objective/kl": 25.051952362060547, "objective/non_score_reward": -1.2525975704193115, "objective/rlhf_reward": -2.6103905797004696, "objective/scores": 0.6, "policy/approxkl_avg": 19.29462432861328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 754, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981729984283447 }, { "episode": 12096, "epoch": 0.07247366718193911, "loss/policy_avg": 0.24108710885047913, "lr": 9.517510224948877e-06, "objective/entropy": -211.13575744628906, "objective/kl": 35.66078186035156, "objective/non_score_reward": -1.7830390930175781, "objective/rlhf_reward": -5.708324392040339, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.15980339050293, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.60546875, "step": 755, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9982430934906006 }, { "episode": 12112, "epoch": 0.07256953182106865, "loss/policy_avg": 0.6718421578407288, "lr": 9.516871165644172e-06, "objective/entropy": -148.00872802734375, "objective/kl": 30.348403930664062, "objective/non_score_reward": -1.5174202919006348, "objective/rlhf_reward": -4.669681048393249, "objective/scores": 0.35, "policy/approxkl_avg": 24.264657974243164, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.626953125, "step": 756, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989105463027954 }, { "episode": 12128, "epoch": 0.0726653964601982, "loss/policy_avg": 0.17684796452522278, "lr": 9.516232106339469e-06, "objective/entropy": -220.75283813476562, "objective/kl": 18.81310272216797, "objective/non_score_reward": -0.9406551122665405, "objective/rlhf_reward": -2.3840183998025477, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.753880500793457, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.498046875, "step": 757, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982280731201172 }, { "episode": 12144, "epoch": 0.07276126109932775, "loss/policy_avg": 0.5594636797904968, "lr": 9.515593047034765e-06, "objective/entropy": -182.7705535888672, "objective/kl": 19.829849243164062, "objective/non_score_reward": -0.991492509841919, "objective/rlhf_reward": 0.4340301394462589, "objective/scores": 1.1, "policy/approxkl_avg": 28.46674346923828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.572265625, "step": 758, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991915225982666 }, { "episode": 12160, "epoch": 0.0728571257384573, "loss/policy_avg": 0.6502060890197754, "lr": 9.514953987730062e-06, "objective/entropy": -112.33629608154297, "objective/kl": 39.52580642700195, "objective/non_score_reward": -1.9762903451919556, "objective/rlhf_reward": -5.78245514847425, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 4.3783769607543945, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 759, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975318908691406 }, { "episode": 12176, "epoch": 0.07295299037758685, "loss/policy_avg": 0.88495934009552, "lr": 9.51431492842536e-06, "objective/entropy": -201.14666748046875, "objective/kl": 27.90923309326172, "objective/non_score_reward": -1.3954615592956543, "objective/rlhf_reward": -4.240210583716064, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.1258697509765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 760, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0020923614501953 }, { "episode": 12192, "epoch": 0.0730488550167164, "loss/policy_avg": 0.3271714448928833, "lr": 9.513675869120656e-06, "objective/entropy": -236.55361938476562, "objective/kl": 28.77971839904785, "objective/non_score_reward": -1.43898606300354, "objective/rlhf_reward": -4.35594413280487, "objective/scores": 0.35, "policy/approxkl_avg": 5.469420909881592, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6171875, "step": 761, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997791051864624 }, { "episode": 12208, "epoch": 0.07314471965584594, "loss/policy_avg": 0.0032866448163986206, "lr": 9.513036809815951e-06, "objective/entropy": -200.22227478027344, "objective/kl": 28.73204803466797, "objective/non_score_reward": -1.4366023540496826, "objective/rlhf_reward": -4.142289552752095, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 0.5752939581871033, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.587890625, "step": 762, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0023627281188965 }, { "episode": 12224, "epoch": 0.0732405842949755, "loss/policy_avg": 0.21868771314620972, "lr": 9.512397750511248e-06, "objective/entropy": -187.9447784423828, "objective/kl": 20.44854736328125, "objective/non_score_reward": -1.0224274396896362, "objective/rlhf_reward": -2.5739379761540255, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 11.789055824279785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 763, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979248046875 }, { "episode": 12240, "epoch": 0.07333644893410504, "loss/policy_avg": 0.3879333734512329, "lr": 9.511758691206545e-06, "objective/entropy": -267.96685791015625, "objective/kl": 28.91057586669922, "objective/non_score_reward": -1.4455287456512451, "objective/rlhf_reward": -3.8347037536668136, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.813044548034668, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6953125, "step": 764, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0009827613830566 }, { "episode": 12256, "epoch": 0.0734323135732346, "loss/policy_avg": 0.06569409370422363, "lr": 9.511119631901842e-06, "objective/entropy": -207.83352661132812, "objective/kl": 24.208805084228516, "objective/non_score_reward": -1.2104402780532837, "objective/rlhf_reward": -3.2855019261508733, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.039762258529663, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.658203125, "step": 765, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0008108615875244 }, { "episode": 12272, "epoch": 0.07352817821236414, "loss/policy_avg": 0.9109029769897461, "lr": 9.510480572597139e-06, "objective/entropy": -85.82101440429688, "objective/kl": 31.18517303466797, "objective/non_score_reward": -1.5592585802078247, "objective/rlhf_reward": -4.50370092789332, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.987689018249512, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.56640625, "step": 766, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997962236404419 }, { "episode": 12288, "epoch": 0.0736240428514937, "loss/policy_avg": 0.44006603956222534, "lr": 9.509841513292434e-06, "objective/entropy": -254.5596923828125, "objective/kl": 26.123559951782227, "objective/non_score_reward": -1.3061779737472534, "objective/rlhf_reward": -3.6684524109035284, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 13.005337715148926, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.552734375, "step": 767, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9978489875793457 }, { "episode": 12304, "epoch": 0.07371990749062324, "loss/policy_avg": 0.14191022515296936, "lr": 9.509202453987731e-06, "objective/entropy": -185.1569061279297, "objective/kl": 38.093666076660156, "objective/non_score_reward": -1.9046835899353027, "objective/rlhf_reward": -7.618734002113342, "objective/scores": 0.0, "policy/approxkl_avg": 60.80290603637695, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 768, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9955925941467285 }, { "episode": 12320, "epoch": 0.0738157721297528, "loss/policy_avg": -0.31537145376205444, "lr": 9.508563394683026e-06, "objective/entropy": -164.9215087890625, "objective/kl": 30.594449996948242, "objective/non_score_reward": -1.5297224521636963, "objective/rlhf_reward": -4.63793725055015, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.2754226922988892, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6171875, "step": 769, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001909017562866 }, { "episode": 12336, "epoch": 0.07391163676888234, "loss/policy_avg": 0.034731436520814896, "lr": 9.507924335378323e-06, "objective/entropy": -200.43959045410156, "objective/kl": 36.4830436706543, "objective/non_score_reward": -1.8241522312164307, "objective/rlhf_reward": -5.8727765872078805, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.3153905868530273, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626953125, "step": 770, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0031118392944336 }, { "episode": 12352, "epoch": 0.07400750140801189, "loss/policy_avg": 0.29965466260910034, "lr": 9.50728527607362e-06, "objective/entropy": -168.58261108398438, "objective/kl": 34.881736755371094, "objective/non_score_reward": -1.7440869808197021, "objective/rlhf_reward": -5.314488296926605, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 12.419918060302734, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.744140625, "step": 771, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999511957168579 }, { "episode": 12368, "epoch": 0.07410336604714143, "loss/policy_avg": 0.5840628743171692, "lr": 9.506646216768917e-06, "objective/entropy": -149.50210571289062, "objective/kl": 26.40768051147461, "objective/non_score_reward": -1.3203840255737305, "objective/rlhf_reward": -3.8005837230042214, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 10.453241348266602, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.560546875, "step": 772, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990813732147217 }, { "episode": 12384, "epoch": 0.07419923068627099, "loss/policy_avg": -0.20146791636943817, "lr": 9.506007157464214e-06, "objective/entropy": -206.66688537597656, "objective/kl": 25.146541595458984, "objective/non_score_reward": -1.2573271989822388, "objective/rlhf_reward": -5.029308795928955, "objective/scores": 0.0, "policy/approxkl_avg": 55.61228561401367, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.728515625, "step": 773, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993964433670044 }, { "episode": 12400, "epoch": 0.07429509532540053, "loss/policy_avg": 2.0998456478118896, "lr": 9.50536809815951e-06, "objective/entropy": -135.09249877929688, "objective/kl": 26.86371612548828, "objective/non_score_reward": -1.3431859016418457, "objective/rlhf_reward": -3.922145526023254, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 7.190234184265137, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.548828125, "step": 774, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000576972961426 }, { "episode": 12416, "epoch": 0.07439095996453009, "loss/policy_avg": 0.024284163489937782, "lr": 9.504729038854806e-06, "objective/entropy": -269.6484375, "objective/kl": 21.226428985595703, "objective/non_score_reward": -1.061321496963501, "objective/rlhf_reward": -2.7946879669145197, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.07242488861084, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.66796875, "step": 775, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999997615814209 }, { "episode": 12432, "epoch": 0.07448682460365963, "loss/policy_avg": 0.03317616134881973, "lr": 9.504089979550103e-06, "objective/entropy": -234.43389892578125, "objective/kl": 27.79866600036621, "objective/non_score_reward": -1.3899333477020264, "objective/rlhf_reward": -3.8264001766840616, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.3638486862182617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689453125, "step": 776, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996023178100586 }, { "episode": 12448, "epoch": 0.07458268924278919, "loss/policy_avg": 0.16213266551494598, "lr": 9.5034509202454e-06, "objective/entropy": -203.708740234375, "objective/kl": 38.612911224365234, "objective/non_score_reward": -1.9306457042694092, "objective/rlhf_reward": -6.271984438510284, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.698218584060669, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.64453125, "step": 777, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995825290679932 }, { "episode": 12464, "epoch": 0.07467855388191873, "loss/policy_avg": 0.2597602605819702, "lr": 9.502811860940696e-06, "objective/entropy": -250.4356231689453, "objective/kl": 30.581310272216797, "objective/non_score_reward": -1.529065489768982, "objective/rlhf_reward": -4.737660029021603, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 8.781853675842285, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.615234375, "step": 778, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989944696426392 }, { "episode": 12480, "epoch": 0.07477441852104828, "loss/policy_avg": -0.24061758816242218, "lr": 9.502172801635993e-06, "objective/entropy": -98.61205291748047, "objective/kl": 26.375612258911133, "objective/non_score_reward": -1.3187806606292725, "objective/rlhf_reward": -3.794169786389231, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.955351829528809, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.7265625, "step": 779, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.003558397293091 }, { "episode": 12496, "epoch": 0.07487028316017783, "loss/policy_avg": 0.48288995027542114, "lr": 9.50153374233129e-06, "objective/entropy": -230.7918701171875, "objective/kl": 37.52941131591797, "objective/non_score_reward": -1.8764704465866089, "objective/rlhf_reward": -6.024929526265025, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 12.408464431762695, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.68359375, "step": 780, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990174770355225 }, { "episode": 12512, "epoch": 0.07496614779930738, "loss/policy_avg": 0.27871203422546387, "lr": 9.500894683026585e-06, "objective/entropy": -159.85903930664062, "objective/kl": 25.038909912109375, "objective/non_score_reward": -1.2519454956054688, "objective/rlhf_reward": -2.607781863212585, "objective/scores": 0.6, "policy/approxkl_avg": 46.26438903808594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.658203125, "step": 781, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000070095062256 }, { "episode": 12528, "epoch": 0.07506201243843692, "loss/policy_avg": 0.06291055679321289, "lr": 9.500255623721882e-06, "objective/entropy": -163.0406494140625, "objective/kl": 27.101749420166016, "objective/non_score_reward": -1.3550875186920166, "objective/rlhf_reward": -4.061099970076961, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 13.61475658416748, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63671875, "step": 782, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986028671264648 }, { "episode": 12544, "epoch": 0.07515787707756648, "loss/policy_avg": 0.07766500115394592, "lr": 9.499616564417179e-06, "objective/entropy": -264.68377685546875, "objective/kl": 26.38882827758789, "objective/non_score_reward": -1.319441556930542, "objective/rlhf_reward": -2.3540468558084697, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 29.816272735595703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 783, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993165731430054 }, { "episode": 12560, "epoch": 0.07525374171669602, "loss/policy_avg": -0.25779616832733154, "lr": 9.498977505112476e-06, "objective/entropy": -192.4373016357422, "objective/kl": 30.569807052612305, "objective/non_score_reward": -1.528490424156189, "objective/rlhf_reward": -4.5098417139688305, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.409776210784912, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.765625, "step": 784, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0031237602233887 }, { "episode": 12576, "epoch": 0.07534960635582558, "loss/policy_avg": -0.23182180523872375, "lr": 9.498338445807773e-06, "objective/entropy": -116.57367706298828, "objective/kl": 30.319534301757812, "objective/non_score_reward": -1.5159766674041748, "objective/rlhf_reward": -4.704657160972042, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.2308108806610107, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.3955078125, "step": 785, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001164436340332 }, { "episode": 12592, "epoch": 0.07544547099495512, "loss/policy_avg": 0.270114541053772, "lr": 9.497699386503068e-06, "objective/entropy": -213.6279296875, "objective/kl": 34.02395248413086, "objective/non_score_reward": -1.701197624206543, "objective/rlhf_reward": -3.8810713633310527, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.159467697143555, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.556640625, "step": 786, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999849557876587 }, { "episode": 12608, "epoch": 0.07554133563408467, "loss/policy_avg": 0.01593317836523056, "lr": 9.497060327198365e-06, "objective/entropy": -83.6307601928711, "objective/kl": 28.397233963012695, "objective/non_score_reward": -1.4198617935180664, "objective/rlhf_reward": -4.198494317944407, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 13.974614143371582, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.576171875, "step": 787, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9974963665008545 }, { "episode": 12624, "epoch": 0.07563720027321422, "loss/policy_avg": 0.122782863676548, "lr": 9.496421267893662e-06, "objective/entropy": -66.27203369140625, "objective/kl": 20.0443115234375, "objective/non_score_reward": -1.0022156238555908, "objective/rlhf_reward": -2.6302602673448146, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.128955364227295, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.78125, "step": 788, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002767562866211 }, { "episode": 12640, "epoch": 0.07573306491234377, "loss/policy_avg": 0.06789802759885788, "lr": 9.495782208588959e-06, "objective/entropy": -174.1296844482422, "objective/kl": 28.25243377685547, "objective/non_score_reward": -1.4126217365264893, "objective/rlhf_reward": -3.988627438963042, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 21.132152557373047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.802734375, "step": 789, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0010154247283936 }, { "episode": 12656, "epoch": 0.07582892955147331, "loss/policy_avg": 0.1666814684867859, "lr": 9.495143149284254e-06, "objective/entropy": -226.70257568359375, "objective/kl": 28.976097106933594, "objective/non_score_reward": -1.4488048553466797, "objective/rlhf_reward": -4.371387500961391, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.0613138675689697, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62890625, "step": 790, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994125366210938 }, { "episode": 12672, "epoch": 0.07592479419060287, "loss/policy_avg": 0.1284073442220688, "lr": 9.49450408997955e-06, "objective/entropy": -215.84002685546875, "objective/kl": 28.486852645874023, "objective/non_score_reward": -1.4243427515029907, "objective/rlhf_reward": -4.355735114126831, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.659012317657471, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.64453125, "step": 791, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002176761627197 }, { "episode": 12688, "epoch": 0.07602065882973241, "loss/policy_avg": -0.04723303020000458, "lr": 9.493865030674848e-06, "objective/entropy": -227.61280822753906, "objective/kl": 28.772476196289062, "objective/non_score_reward": -1.4386236667633057, "objective/rlhf_reward": -2.830775891185972, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.8349313735961914, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.609375, "step": 792, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002286911010742 }, { "episode": 12704, "epoch": 0.07611652346886197, "loss/policy_avg": -0.01974731869995594, "lr": 9.493225971370144e-06, "objective/entropy": -168.45291137695312, "objective/kl": 32.674957275390625, "objective/non_score_reward": -1.633747935295105, "objective/rlhf_reward": -5.209478828936739, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 0.8098639249801636, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 793, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004916191101074 }, { "episode": 12720, "epoch": 0.07621238810799151, "loss/policy_avg": 0.3524478077888489, "lr": 9.49258691206544e-06, "objective/entropy": -170.04669189453125, "objective/kl": 35.1775016784668, "objective/non_score_reward": -1.7588751316070557, "objective/rlhf_reward": -5.479241101947382, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 4.70783805847168, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.658203125, "step": 794, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981024265289307 }, { "episode": 12736, "epoch": 0.07630825274712107, "loss/policy_avg": 0.14937232434749603, "lr": 9.491947852760736e-06, "objective/entropy": -258.00518798828125, "objective/kl": 30.382396697998047, "objective/non_score_reward": -1.5191197395324707, "objective/rlhf_reward": -4.472358975473957, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.522323608398438, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.68359375, "step": 795, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.00205659866333 }, { "episode": 12752, "epoch": 0.07640411738625061, "loss/policy_avg": 0.4101511240005493, "lr": 9.491308793456033e-06, "objective/entropy": -97.3719482421875, "objective/kl": 49.89447021484375, "objective/non_score_reward": -2.4947237968444824, "objective/rlhf_reward": -7.578894591331482, "objective/scores": 0.6, "policy/approxkl_avg": 19.377134323120117, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.498046875, "step": 796, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981412887573242 }, { "episode": 12768, "epoch": 0.07649998202538016, "loss/policy_avg": -0.0627971962094307, "lr": 9.49066973415133e-06, "objective/entropy": -110.8655776977539, "objective/kl": 44.73468017578125, "objective/non_score_reward": -2.23673415184021, "objective/rlhf_reward": -6.546936726570129, "objective/scores": 0.6, "policy/approxkl_avg": 5.804272651672363, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.806640625, "step": 797, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971017837524414 }, { "episode": 12784, "epoch": 0.0765958466645097, "loss/policy_avg": 0.3731452226638794, "lr": 9.490030674846627e-06, "objective/entropy": -15.07757568359375, "objective/kl": 24.15683364868164, "objective/non_score_reward": -1.2078416347503662, "objective/rlhf_reward": -2.4313664793968197, "objective/scores": 0.6, "policy/approxkl_avg": 5.745340347290039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.638671875, "step": 798, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993362426757812 }, { "episode": 12800, "epoch": 0.07669171130363926, "loss/policy_avg": 0.3336324691772461, "lr": 9.489391615541922e-06, "objective/entropy": -249.59414672851562, "objective/kl": 28.68617820739746, "objective/non_score_reward": -1.4343090057373047, "objective/rlhf_reward": -2.8135166510355205, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.9479708671569824, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.693359375, "step": 799, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993374347686768 }, { "episode": 12816, "epoch": 0.0767875759427688, "loss/policy_avg": 0.12261458486318588, "lr": 9.488752556237219e-06, "objective/entropy": -207.68580627441406, "objective/kl": 33.91386413574219, "objective/non_score_reward": -1.6956932544708252, "objective/rlhf_reward": -5.301820400174021, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 25.18114471435547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625, "step": 800, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9971894025802612 }, { "episode": 12832, "epoch": 0.07688344058189836, "loss/policy_avg": 0.1192292720079422, "lr": 9.488113496932516e-06, "objective/entropy": -268.4300842285156, "objective/kl": 26.710205078125, "objective/non_score_reward": -1.3355103731155396, "objective/rlhf_reward": -4.000405719786316, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.064979553222656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.638671875, "step": 801, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994006156921387 }, { "episode": 12848, "epoch": 0.0769793052210279, "loss/policy_avg": 0.4274081587791443, "lr": 9.487474437627813e-06, "objective/entropy": -125.00625610351562, "objective/kl": 36.30561065673828, "objective/non_score_reward": -1.815280795097351, "objective/rlhf_reward": -5.901873194907589, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 9.215574264526367, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.55078125, "step": 802, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999779224395752 }, { "episode": 12864, "epoch": 0.07707516986015746, "loss/policy_avg": 0.02082793414592743, "lr": 9.48683537832311e-06, "objective/entropy": 49.048545837402344, "objective/kl": 31.830245971679688, "objective/non_score_reward": -1.5915122032165527, "objective/rlhf_reward": -4.915450672717437, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.6811680793762207, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4384765625, "step": 803, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994103908538818 }, { "episode": 12880, "epoch": 0.077171034499287, "loss/policy_avg": 0.1582624763250351, "lr": 9.486196319018407e-06, "objective/entropy": -110.25260925292969, "objective/kl": 31.00435447692871, "objective/non_score_reward": -1.550217866897583, "objective/rlhf_reward": -3.8008712291717526, "objective/scores": 0.6, "policy/approxkl_avg": 3.5253429412841797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.55859375, "step": 804, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0027151107788086 }, { "episode": 12896, "epoch": 0.07726689913841656, "loss/policy_avg": 0.09249435365200043, "lr": 9.485557259713702e-06, "objective/entropy": -203.63662719726562, "objective/kl": 31.04816436767578, "objective/non_score_reward": -1.552408218383789, "objective/rlhf_reward": -4.547773247182952, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.3485993146896362, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 805, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999431848526001 }, { "episode": 12912, "epoch": 0.0773627637775461, "loss/policy_avg": 0.44563794136047363, "lr": 9.484918200408999e-06, "objective/entropy": -163.74508666992188, "objective/kl": 31.982746124267578, "objective/non_score_reward": -1.599137306213379, "objective/rlhf_reward": -3.472829972149107, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 87.72571563720703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.70703125, "step": 806, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001553535461426 }, { "episode": 12928, "epoch": 0.07745862841667565, "loss/policy_avg": -0.017649848014116287, "lr": 9.484279141104296e-06, "objective/entropy": -266.5451965332031, "objective/kl": 27.058134078979492, "objective/non_score_reward": -1.3529068231582642, "objective/rlhf_reward": -1.0116270542144772, "objective/scores": 1.1, "policy/approxkl_avg": 5.037982940673828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 807, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0039749145507812 }, { "episode": 12944, "epoch": 0.07755449305580521, "loss/policy_avg": 5.042888641357422, "lr": 9.483640081799592e-06, "objective/entropy": -212.65740966796875, "objective/kl": 24.790084838867188, "objective/non_score_reward": -1.2395042181015015, "objective/rlhf_reward": -3.4770642546967263, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 11.046760559082031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.533203125, "step": 808, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002431869506836 }, { "episode": 12960, "epoch": 0.07765035769493475, "loss/policy_avg": -0.07623002678155899, "lr": 9.48300102249489e-06, "objective/entropy": -167.7131805419922, "objective/kl": 31.204689025878906, "objective/non_score_reward": -1.5602343082427979, "objective/rlhf_reward": -4.790339152427062, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 5.110037803649902, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5234375, "step": 809, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989296197891235 }, { "episode": 12976, "epoch": 0.07774622233406431, "loss/policy_avg": 0.0697702169418335, "lr": 9.482361963190185e-06, "objective/entropy": -99.56057739257812, "objective/kl": 40.95980453491211, "objective/non_score_reward": -2.047990322113037, "objective/rlhf_reward": -5.268241856933805, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.0177828073501587, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595703125, "step": 810, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999544620513916 }, { "episode": 12992, "epoch": 0.07784208697319385, "loss/policy_avg": 0.011765815317630768, "lr": 9.481722903885481e-06, "objective/entropy": -270.2078857421875, "objective/kl": 32.53266906738281, "objective/non_score_reward": -1.6266334056854248, "objective/rlhf_reward": -4.950274675098017, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 10.882495880126953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.685546875, "step": 811, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997708797454834 }, { "episode": 13008, "epoch": 0.0779379516123234, "loss/policy_avg": 0.4012794494628906, "lr": 9.481083844580777e-06, "objective/entropy": -139.22914123535156, "objective/kl": 37.05573272705078, "objective/non_score_reward": -1.8527867794036865, "objective/rlhf_reward": -5.586318249973367, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 210.83877563476562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6640625, "step": 812, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001636505126953 }, { "episode": 13024, "epoch": 0.07803381625145295, "loss/policy_avg": 0.2699980139732361, "lr": 9.480444785276073e-06, "objective/entropy": -196.59963989257812, "objective/kl": 30.699893951416016, "objective/non_score_reward": -1.5349947214126587, "objective/rlhf_reward": -3.7399788856506344, "objective/scores": 0.6, "policy/approxkl_avg": 2.332146167755127, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.619140625, "step": 813, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989255666732788 }, { "episode": 13040, "epoch": 0.0781296808905825, "loss/policy_avg": 0.20207370817661285, "lr": 9.47980572597137e-06, "objective/entropy": -267.2593994140625, "objective/kl": 33.34029006958008, "objective/non_score_reward": -1.6670145988464355, "objective/rlhf_reward": -5.342545185118837, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.632169246673584, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.564453125, "step": 814, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993088245391846 }, { "episode": 13056, "epoch": 0.07822554552971205, "loss/policy_avg": 0.1745888739824295, "lr": 9.479166666666667e-06, "objective/entropy": -108.20680236816406, "objective/kl": 35.203025817871094, "objective/non_score_reward": -1.7601512670516968, "objective/rlhf_reward": -5.484345762935236, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 8.32550048828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.736328125, "step": 815, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0030176639556885 }, { "episode": 13072, "epoch": 0.0783214101688416, "loss/policy_avg": 0.2600640654563904, "lr": 9.478527607361964e-06, "objective/entropy": -204.03048706054688, "objective/kl": 40.41114807128906, "objective/non_score_reward": -2.020557403564453, "objective/rlhf_reward": -6.74059360316339, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.140628814697266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619140625, "step": 816, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000810623168945 }, { "episode": 13088, "epoch": 0.07841727480797114, "loss/policy_avg": 0.5273202061653137, "lr": 9.477888548057261e-06, "objective/entropy": -241.156494140625, "objective/kl": 24.541404724121094, "objective/non_score_reward": -1.2270702123641968, "objective/rlhf_reward": -3.246421401918517, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.965031623840332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.61328125, "step": 817, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990627765655518 }, { "episode": 13104, "epoch": 0.0785131394471007, "loss/policy_avg": -0.09151424467563629, "lr": 9.477249488752556e-06, "objective/entropy": -219.21754455566406, "objective/kl": 31.261905670166016, "objective/non_score_reward": -1.5630953311920166, "objective/rlhf_reward": -4.4275525763359775, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.8227334022521973, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.833984375, "step": 818, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0036370754241943 }, { "episode": 13120, "epoch": 0.07860900408623024, "loss/policy_avg": 0.13953115046024323, "lr": 9.476610429447853e-06, "objective/entropy": -186.8937530517578, "objective/kl": 27.69632339477539, "objective/non_score_reward": -1.3848161697387695, "objective/rlhf_reward": -3.1392647981643673, "objective/scores": 0.6, "policy/approxkl_avg": 3.2056455612182617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 819, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.003466844558716 }, { "episode": 13136, "epoch": 0.0787048687253598, "loss/policy_avg": 0.6420396566390991, "lr": 9.47597137014315e-06, "objective/entropy": -134.00025939941406, "objective/kl": 22.993852615356445, "objective/non_score_reward": -1.1496926546096802, "objective/rlhf_reward": -2.651359389500554, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.607414722442627, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611328125, "step": 820, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000748634338379 }, { "episode": 13152, "epoch": 0.07880073336448934, "loss/policy_avg": 0.08356916159391403, "lr": 9.475332310838447e-06, "objective/entropy": -189.72003173828125, "objective/kl": 26.506973266601562, "objective/non_score_reward": -1.3253486156463623, "objective/rlhf_reward": -3.959759166746765, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.290050029754639, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 821, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9971721172332764 }, { "episode": 13168, "epoch": 0.0788965980036189, "loss/policy_avg": 0.11917827278375626, "lr": 9.474693251533744e-06, "objective/entropy": -207.30722045898438, "objective/kl": 35.41877746582031, "objective/non_score_reward": -1.7709391117095947, "objective/rlhf_reward": -5.683756327629089, "objective/scores": 0.35, "policy/approxkl_avg": 6.870448112487793, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.57421875, "step": 822, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9955227375030518 }, { "episode": 13184, "epoch": 0.07899246264274844, "loss/policy_avg": -0.3528624475002289, "lr": 9.474054192229039e-06, "objective/entropy": -138.19627380371094, "objective/kl": 27.491954803466797, "objective/non_score_reward": -1.3745976686477661, "objective/rlhf_reward": -4.156755199938446, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.025694727897644, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.52734375, "step": 823, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0017786026000977 }, { "episode": 13200, "epoch": 0.079088327281878, "loss/policy_avg": 0.3965766727924347, "lr": 9.473415132924336e-06, "objective/entropy": -244.5587921142578, "objective/kl": 28.361434936523438, "objective/non_score_reward": -1.4180717468261719, "objective/rlhf_reward": -3.549580337778602, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 22.821792602539062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.53515625, "step": 824, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997540831565857 }, { "episode": 13216, "epoch": 0.07918419192100754, "loss/policy_avg": 0.183881938457489, "lr": 9.472776073619633e-06, "objective/entropy": -235.81063842773438, "objective/kl": 35.635047912597656, "objective/non_score_reward": -1.7817524671554565, "objective/rlhf_reward": -5.785373976736694, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 13.993101119995117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 825, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987282752990723 }, { "episode": 13232, "epoch": 0.07928005656013709, "loss/policy_avg": 0.13472305238246918, "lr": 9.47213701431493e-06, "objective/entropy": -209.61251831054688, "objective/kl": 32.511722564697266, "objective/non_score_reward": -1.6255862712860107, "objective/rlhf_reward": -5.176832351714296, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 10.514575958251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.693359375, "step": 826, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980220794677734 }, { "episode": 13248, "epoch": 0.07937592119926663, "loss/policy_avg": 0.28974202275276184, "lr": 9.471497955010226e-06, "objective/entropy": -277.55413818359375, "objective/kl": 23.343517303466797, "objective/non_score_reward": -1.1671757698059082, "objective/rlhf_reward": -4.668703377246857, "objective/scores": 0.0, "policy/approxkl_avg": 4.868777275085449, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.595703125, "step": 827, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984409809112549 }, { "episode": 13264, "epoch": 0.07947178583839619, "loss/policy_avg": 0.0649593323469162, "lr": 9.470858895705523e-06, "objective/entropy": -168.09161376953125, "objective/kl": 32.58544921875, "objective/non_score_reward": -1.6292723417282104, "objective/rlhf_reward": -5.001317584308323, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 13.682709693908691, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.630859375, "step": 828, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994233846664429 }, { "episode": 13280, "epoch": 0.07956765047752573, "loss/policy_avg": 0.01223127543926239, "lr": 9.470219836400818e-06, "objective/entropy": -24.52312469482422, "objective/kl": 37.070613861083984, "objective/non_score_reward": -1.8535306453704834, "objective/rlhf_reward": -5.963524679751739, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.4948031902313232, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 829, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0006215572357178 }, { "episode": 13296, "epoch": 0.07966351511665529, "loss/policy_avg": 0.08012821525335312, "lr": 9.469580777096115e-06, "objective/entropy": -222.74710083007812, "objective/kl": 29.31437873840332, "objective/non_score_reward": -1.4657189846038818, "objective/rlhf_reward": -5.862875819206238, "objective/scores": 0.0, "policy/approxkl_avg": 7.948197364807129, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.720703125, "step": 830, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999361515045166 }, { "episode": 13312, "epoch": 0.07975937975578483, "loss/policy_avg": 0.25244101881980896, "lr": 9.468941717791412e-06, "objective/entropy": -256.2400817871094, "objective/kl": 25.82564926147461, "objective/non_score_reward": -1.2912824153900146, "objective/rlhf_reward": -5.165129542350769, "objective/scores": 0.0, "policy/approxkl_avg": 25.767894744873047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.625, "step": 831, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9958332777023315 }, { "episode": 13328, "epoch": 0.07985524439491438, "loss/policy_avg": 0.20151713490486145, "lr": 9.468302658486709e-06, "objective/entropy": -176.53012084960938, "objective/kl": 31.989328384399414, "objective/non_score_reward": -1.5994665622711182, "objective/rlhf_reward": -4.94726787051712, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.573209762573242, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 832, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0001397132873535 }, { "episode": 13344, "epoch": 0.07995110903404393, "loss/policy_avg": 0.40637868642807007, "lr": 9.467663599182006e-06, "objective/entropy": -157.83944702148438, "objective/kl": 26.236248016357422, "objective/non_score_reward": -1.311812400817871, "objective/rlhf_reward": -2.847249662876129, "objective/scores": 0.6, "policy/approxkl_avg": 41.408966064453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69921875, "step": 833, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979543685913086 }, { "episode": 13360, "epoch": 0.08004697367317348, "loss/policy_avg": 0.4117756485939026, "lr": 9.467024539877301e-06, "objective/entropy": -154.52528381347656, "objective/kl": 34.40885925292969, "objective/non_score_reward": -1.7204430103302002, "objective/rlhf_reward": -3.958053027034971, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.24909782409668, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.552734375, "step": 834, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996882677078247 }, { "episode": 13376, "epoch": 0.08014283831230302, "loss/policy_avg": 0.25968849658966064, "lr": 9.466385480572598e-06, "objective/entropy": -35.48725509643555, "objective/kl": 48.416969299316406, "objective/non_score_reward": -2.4208483695983887, "objective/rlhf_reward": -8.324143612121029, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.6608781814575195, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4736328125, "step": 835, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974427223205566 }, { "episode": 13392, "epoch": 0.08023870295143258, "loss/policy_avg": 0.6013174057006836, "lr": 9.465746421267893e-06, "objective/entropy": -131.218994140625, "objective/kl": 40.460113525390625, "objective/non_score_reward": -2.023005723953247, "objective/rlhf_reward": -6.267194564613412, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 5.2574968338012695, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.765625, "step": 836, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985507726669312 }, { "episode": 13408, "epoch": 0.08033456759056212, "loss/policy_avg": 0.024118170142173767, "lr": 9.46510736196319e-06, "objective/entropy": -219.2191162109375, "objective/kl": 37.4605712890625, "objective/non_score_reward": -1.8730283975601196, "objective/rlhf_reward": -5.544702480511601, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.715839385986328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603515625, "step": 837, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003450393676758 }, { "episode": 13424, "epoch": 0.08043043222969168, "loss/policy_avg": 0.3022631108760834, "lr": 9.464468302658487e-06, "objective/entropy": -122.02997589111328, "objective/kl": 32.87577438354492, "objective/non_score_reward": -1.6437886953353882, "objective/rlhf_reward": -5.196552612868649, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.3451852798461914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 838, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992682933807373 }, { "episode": 13440, "epoch": 0.08052629686882122, "loss/policy_avg": 0.09435372054576874, "lr": 9.463829243353784e-06, "objective/entropy": -228.3193817138672, "objective/kl": 27.057086944580078, "objective/non_score_reward": -1.3528543710708618, "objective/rlhf_reward": -3.7495579771405323, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 64.43006896972656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.642578125, "step": 839, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9953296184539795 }, { "episode": 13456, "epoch": 0.08062216150795078, "loss/policy_avg": 1.2935261726379395, "lr": 9.46319018404908e-06, "objective/entropy": -160.080322265625, "objective/kl": 34.4007568359375, "objective/non_score_reward": -1.7200379371643066, "objective/rlhf_reward": -5.538515916376738, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 131.64187622070312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6015625, "step": 840, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983408451080322 }, { "episode": 13472, "epoch": 0.08071802614708032, "loss/policy_avg": 0.5178288817405701, "lr": 9.462551124744378e-06, "objective/entropy": -140.98907470703125, "objective/kl": 32.42417526245117, "objective/non_score_reward": -1.621208906173706, "objective/rlhf_reward": -4.084835386276245, "objective/scores": 0.6, "policy/approxkl_avg": 2.9638893604278564, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 841, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993921518325806 }, { "episode": 13488, "epoch": 0.08081389078620987, "loss/policy_avg": 1.674887776374817, "lr": 9.461912065439673e-06, "objective/entropy": -140.6572723388672, "objective/kl": 33.64493179321289, "objective/non_score_reward": -1.682246446609497, "objective/rlhf_reward": -2.3289861440658566, "objective/scores": 1.1, "policy/approxkl_avg": 2.7393760681152344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.73828125, "step": 842, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0029544830322266 }, { "episode": 13504, "epoch": 0.08090975542533942, "loss/policy_avg": 0.10809826105833054, "lr": 9.46127300613497e-06, "objective/entropy": 33.49109649658203, "objective/kl": 46.121177673339844, "objective/non_score_reward": -2.3060591220855713, "objective/rlhf_reward": -7.399407501491616, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.247078895568848, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7578125, "step": 843, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997997283935547 }, { "episode": 13520, "epoch": 0.08100562006446897, "loss/policy_avg": 0.08235388994216919, "lr": 9.460633946830267e-06, "objective/entropy": -232.94918823242188, "objective/kl": 29.242427825927734, "objective/non_score_reward": -1.4621214866638184, "objective/rlhf_reward": -5.848485827445984, "objective/scores": 0.0, "policy/approxkl_avg": 7.9668121337890625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.708984375, "step": 844, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998790979385376 }, { "episode": 13536, "epoch": 0.08110148470359851, "loss/policy_avg": 0.15575401484966278, "lr": 9.459994887525563e-06, "objective/entropy": -230.47235107421875, "objective/kl": 39.588829040527344, "objective/non_score_reward": -1.9794416427612305, "objective/rlhf_reward": -6.401994669231113, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.700314521789551, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.708984375, "step": 845, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9979462623596191 }, { "episode": 13552, "epoch": 0.08119734934272807, "loss/policy_avg": 0.13659973442554474, "lr": 9.45935582822086e-06, "objective/entropy": -174.33474731445312, "objective/kl": 28.351903915405273, "objective/non_score_reward": -1.4175951480865479, "objective/rlhf_reward": -2.746661697269651, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.24754524230957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.705078125, "step": 846, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0036792755126953 }, { "episode": 13568, "epoch": 0.08129321398185761, "loss/policy_avg": -0.0010715574026107788, "lr": 9.458716768916156e-06, "objective/entropy": -106.94636535644531, "objective/kl": 43.695289611816406, "objective/non_score_reward": -2.1847643852233887, "objective/rlhf_reward": -7.077198391378509, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.114851474761963, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859375, "step": 847, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0022311210632324 }, { "episode": 13584, "epoch": 0.08138907862098717, "loss/policy_avg": -0.020745811983942986, "lr": 9.458077709611452e-06, "objective/entropy": -274.30377197265625, "objective/kl": 29.099441528320312, "objective/non_score_reward": -1.4549720287322998, "objective/rlhf_reward": -4.215768191877919, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.374234199523926, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.55078125, "step": 848, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0002601146698 }, { "episode": 13600, "epoch": 0.08148494326011671, "loss/policy_avg": 0.08369505405426025, "lr": 9.45743865030675e-06, "objective/entropy": -90.9344482421875, "objective/kl": 32.62782669067383, "objective/non_score_reward": -1.6313912868499756, "objective/rlhf_reward": -4.921445462767201, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.873699426651001, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 849, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983861446380615 }, { "episode": 13616, "epoch": 0.08158080789924627, "loss/policy_avg": 0.12610237300395966, "lr": 9.456799591002046e-06, "objective/entropy": -216.01071166992188, "objective/kl": 31.95155906677246, "objective/non_score_reward": -1.5975778102874756, "objective/rlhf_reward": -5.048675945311218, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 17.690187454223633, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.630859375, "step": 850, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975669384002686 }, { "episode": 13632, "epoch": 0.08167667253837581, "loss/policy_avg": 0.09207138419151306, "lr": 9.456160531697343e-06, "objective/entropy": -213.504638671875, "objective/kl": 33.958152770996094, "objective/non_score_reward": -1.69790780544281, "objective/rlhf_reward": -5.413029053298336, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.278407096862793, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.712890625, "step": 851, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982938766479492 }, { "episode": 13648, "epoch": 0.08177253717750536, "loss/policy_avg": 0.7879657745361328, "lr": 9.45552147239264e-06, "objective/entropy": -179.40536499023438, "objective/kl": 38.20147705078125, "objective/non_score_reward": -1.91007399559021, "objective/rlhf_reward": -6.216463644702998, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.275893211364746, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.52734375, "step": 852, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000800609588623 }, { "episode": 13664, "epoch": 0.0818684018166349, "loss/policy_avg": -0.05168546736240387, "lr": 9.454882413087935e-06, "objective/entropy": -252.6636505126953, "objective/kl": 36.603004455566406, "objective/non_score_reward": -1.8301501274108887, "objective/rlhf_reward": -5.65874100250064, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.8799333572387695, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.734375, "step": 853, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000197410583496 }, { "episode": 13680, "epoch": 0.08196426645576446, "loss/policy_avg": 0.35540589690208435, "lr": 9.454243353783232e-06, "objective/entropy": -263.326171875, "objective/kl": 31.936683654785156, "objective/non_score_reward": -1.5968341827392578, "objective/rlhf_reward": -5.0618239379226395, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 9.10447883605957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 854, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981427192687988 }, { "episode": 13696, "epoch": 0.082060131094894, "loss/policy_avg": -0.01314299926161766, "lr": 9.453604294478529e-06, "objective/entropy": -50.54448699951172, "objective/kl": 27.010623931884766, "objective/non_score_reward": -1.3505312204360962, "objective/rlhf_reward": -4.002124941349029, "objective/scores": 0.35, "policy/approxkl_avg": 72.71121215820312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.755859375, "step": 855, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9988174438476562 }, { "episode": 13712, "epoch": 0.08215599573402356, "loss/policy_avg": 0.507459282875061, "lr": 9.452965235173824e-06, "objective/entropy": -196.7661590576172, "objective/kl": 41.39533615112305, "objective/non_score_reward": -2.0697667598724365, "objective/rlhf_reward": -6.331655929760869, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.633426666259766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.734375, "step": 856, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997053146362305 }, { "episode": 13728, "epoch": 0.0822518603731531, "loss/policy_avg": 0.01022842712700367, "lr": 9.452326175869121e-06, "objective/entropy": -165.575439453125, "objective/kl": 28.162111282348633, "objective/non_score_reward": -1.408105492591858, "objective/rlhf_reward": -4.253819801894528, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.566072463989258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.673828125, "step": 857, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994425773620605 }, { "episode": 13744, "epoch": 0.08234772501228266, "loss/policy_avg": 0.5199975371360779, "lr": 9.451687116564418e-06, "objective/entropy": -191.289794921875, "objective/kl": 25.639848709106445, "objective/non_score_reward": -1.2819924354553223, "objective/rlhf_reward": -3.6121978996121253, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.2938754558563232, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4873046875, "step": 858, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993865489959717 }, { "episode": 13760, "epoch": 0.0824435896514122, "loss/policy_avg": -0.09089094400405884, "lr": 9.451048057259715e-06, "objective/entropy": -222.6432647705078, "objective/kl": 35.101905822753906, "objective/non_score_reward": -1.7550954818725586, "objective/rlhf_reward": -5.641779580203396, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.5215179920196533, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.701171875, "step": 859, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0004208087921143 }, { "episode": 13776, "epoch": 0.08253945429054176, "loss/policy_avg": 0.3994244635105133, "lr": 9.45040899795501e-06, "objective/entropy": -232.05795288085938, "objective/kl": 35.13083267211914, "objective/non_score_reward": -1.7565417289733887, "objective/rlhf_reward": -2.6261669158935543, "objective/scores": 1.1, "policy/approxkl_avg": 7.337094306945801, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.736328125, "step": 860, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000622272491455 }, { "episode": 13792, "epoch": 0.0826353189296713, "loss/policy_avg": 0.241072878241539, "lr": 9.449769938650307e-06, "objective/entropy": -235.5231475830078, "objective/kl": 42.96981430053711, "objective/non_score_reward": -2.1484906673431396, "objective/rlhf_reward": -6.860629336039224, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 5.666136264801025, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 861, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9980902671813965 }, { "episode": 13808, "epoch": 0.08273118356880085, "loss/policy_avg": 0.06892701238393784, "lr": 9.449130879345604e-06, "objective/entropy": -43.37392044067383, "objective/kl": 28.94279670715332, "objective/non_score_reward": -1.447139859199524, "objective/rlhf_reward": -4.446923902540832, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 13.682140350341797, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.873046875, "step": 862, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998504638671875 }, { "episode": 13824, "epoch": 0.0828270482079304, "loss/policy_avg": 0.05104389786720276, "lr": 9.4484918200409e-06, "objective/entropy": -274.24462890625, "objective/kl": 26.58008575439453, "objective/non_score_reward": -1.3290044069290161, "objective/rlhf_reward": -3.6541578821545704, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.673041343688965, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6796875, "step": 863, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998407244682312 }, { "episode": 13840, "epoch": 0.08292291284705995, "loss/policy_avg": 2.0433521270751953, "lr": 9.447852760736197e-06, "objective/entropy": -141.08175659179688, "objective/kl": 38.66474151611328, "objective/non_score_reward": -1.933237075805664, "objective/rlhf_reward": -6.282350401492462, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.4866771697998047, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.849609375, "step": 864, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0020241737365723 }, { "episode": 13856, "epoch": 0.08301877748618951, "loss/policy_avg": 0.5822303891181946, "lr": 9.447213701431494e-06, "objective/entropy": -75.44483184814453, "objective/kl": 42.41981506347656, "objective/non_score_reward": -2.1209909915924072, "objective/rlhf_reward": -7.033365587802276, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.0502395629882812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.763671875, "step": 865, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002496242523193 }, { "episode": 13872, "epoch": 0.08311464212531905, "loss/policy_avg": 1.5961978435516357, "lr": 9.44657464212679e-06, "objective/entropy": -102.62336730957031, "objective/kl": 32.63710021972656, "objective/non_score_reward": -1.6318550109863281, "objective/rlhf_reward": -4.702591176303934, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 44.34449005126953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.583984375, "step": 866, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997666597366333 }, { "episode": 13888, "epoch": 0.0832105067644486, "loss/policy_avg": -0.06377097964286804, "lr": 9.445935582822086e-06, "objective/entropy": -179.53016662597656, "objective/kl": 27.1846981048584, "objective/non_score_reward": -1.3592349290847778, "objective/rlhf_reward": -3.3142334840455394, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.25791072845459, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.67578125, "step": 867, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001521110534668 }, { "episode": 13904, "epoch": 0.08330637140357815, "loss/policy_avg": 0.06122337281703949, "lr": 9.445296523517383e-06, "objective/entropy": -160.8975830078125, "objective/kl": 37.28607940673828, "objective/non_score_reward": -1.8643040657043457, "objective/rlhf_reward": -6.131703171759767, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.380110263824463, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.685546875, "step": 868, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993302822113037 }, { "episode": 13920, "epoch": 0.0834022360427077, "loss/policy_avg": 0.06397978216409683, "lr": 9.44465746421268e-06, "objective/entropy": -279.75146484375, "objective/kl": 36.54051971435547, "objective/non_score_reward": -1.8270260095596313, "objective/rlhf_reward": -5.3606928093003585, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 9.166413307189941, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6953125, "step": 869, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998555064201355 }, { "episode": 13936, "epoch": 0.08349810068183725, "loss/policy_avg": 0.18339544534683228, "lr": 9.444018404907977e-06, "objective/entropy": -197.06088256835938, "objective/kl": 35.413883209228516, "objective/non_score_reward": -1.7706942558288574, "objective/rlhf_reward": -5.420917516172516, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.4228196144104004, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.55859375, "step": 870, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997130632400513 }, { "episode": 13952, "epoch": 0.0835939653209668, "loss/policy_avg": 0.7395508885383606, "lr": 9.443379345603272e-06, "objective/entropy": -175.5420684814453, "objective/kl": 27.310260772705078, "objective/non_score_reward": -1.3655130863189697, "objective/rlhf_reward": -3.9057928611903936, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 20.016393661499023, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.744140625, "step": 871, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980931282043457 }, { "episode": 13968, "epoch": 0.08368982996009634, "loss/policy_avg": 0.11419187486171722, "lr": 9.442740286298569e-06, "objective/entropy": -202.19219970703125, "objective/kl": 26.73446273803711, "objective/non_score_reward": -1.3367230892181396, "objective/rlhf_reward": -0.9468923568725582, "objective/scores": 1.1, "policy/approxkl_avg": 1.4593892097473145, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.505859375, "step": 872, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997605085372925 }, { "episode": 13984, "epoch": 0.0837856945992259, "loss/policy_avg": 0.10254265367984772, "lr": 9.442101226993866e-06, "objective/entropy": -181.49607849121094, "objective/kl": 34.489620208740234, "objective/non_score_reward": -1.7244811058044434, "objective/rlhf_reward": -5.2938043213525585, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.685236930847168, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.572265625, "step": 873, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004138946533203 }, { "episode": 14000, "epoch": 0.08388155923835544, "loss/policy_avg": -0.11048807948827744, "lr": 9.441462167689163e-06, "objective/entropy": -233.58718872070312, "objective/kl": 27.196325302124023, "objective/non_score_reward": -1.359816312789917, "objective/rlhf_reward": -4.080015146468563, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.074767112731934, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.599609375, "step": 874, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000337600708008 }, { "episode": 14016, "epoch": 0.083977423877485, "loss/policy_avg": -0.04991217330098152, "lr": 9.44082310838446e-06, "objective/entropy": -147.29574584960938, "objective/kl": 39.145992279052734, "objective/non_score_reward": -1.9572995901107788, "objective/rlhf_reward": -6.429198360443115, "objective/scores": 0.35, "policy/approxkl_avg": 2.3655714988708496, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.703125, "step": 875, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0024516582489014 }, { "episode": 14032, "epoch": 0.08407328851661454, "loss/policy_avg": 0.018214020878076553, "lr": 9.440184049079757e-06, "objective/entropy": -225.25274658203125, "objective/kl": 28.496929168701172, "objective/non_score_reward": -1.4248464107513428, "objective/rlhf_reward": -4.248787502856597, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.280494689941406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.78515625, "step": 876, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0028023719787598 }, { "episode": 14048, "epoch": 0.0841691531557441, "loss/policy_avg": -0.0712839737534523, "lr": 9.439544989775052e-06, "objective/entropy": -111.49925231933594, "objective/kl": 33.307395935058594, "objective/non_score_reward": -1.6653697490692139, "objective/rlhf_reward": -5.237647135456172, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 10.649118423461914, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.744140625, "step": 877, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0005428791046143 }, { "episode": 14064, "epoch": 0.08426501779487364, "loss/policy_avg": 0.416260302066803, "lr": 9.438905930470349e-06, "objective/entropy": -91.5921630859375, "objective/kl": 36.07551193237305, "objective/non_score_reward": -1.8037755489349365, "objective/rlhf_reward": -5.658842890468195, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.971528053283691, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.546875, "step": 878, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99779212474823 }, { "episode": 14080, "epoch": 0.08436088243400319, "loss/policy_avg": 0.6945221424102783, "lr": 9.438266871165644e-06, "objective/entropy": -103.2996597290039, "objective/kl": 29.02838706970215, "objective/non_score_reward": -1.4514193534851074, "objective/rlhf_reward": -4.249418287482813, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.5951104164123535, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6015625, "step": 879, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996235370635986 }, { "episode": 14096, "epoch": 0.08445674707313273, "loss/policy_avg": 0.14096970856189728, "lr": 9.43762781186094e-06, "objective/entropy": -250.6915283203125, "objective/kl": 24.03522491455078, "objective/non_score_reward": -1.2017613649368286, "objective/rlhf_reward": -3.4284433508790553, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 141.8468017578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.681640625, "step": 880, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993736743927002 }, { "episode": 14112, "epoch": 0.08455261171226229, "loss/policy_avg": 0.3699185848236084, "lr": 9.436988752556238e-06, "objective/entropy": -159.3045196533203, "objective/kl": 40.019386291503906, "objective/non_score_reward": -2.000969409942627, "objective/rlhf_reward": -6.553278903575286, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.20317554473877, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.775390625, "step": 881, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977927207946777 }, { "episode": 14128, "epoch": 0.08464847635139183, "loss/policy_avg": 0.41995298862457275, "lr": 9.436349693251534e-06, "objective/entropy": 76.95626068115234, "objective/kl": 39.00627899169922, "objective/non_score_reward": -1.9503138065338135, "objective/rlhf_reward": -6.139395838201629, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 31.75859832763672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.568359375, "step": 882, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9953019618988037 }, { "episode": 14144, "epoch": 0.08474434099052139, "loss/policy_avg": 0.5355075597763062, "lr": 9.435710633946831e-06, "objective/entropy": -164.35186767578125, "objective/kl": 42.27740478515625, "objective/non_score_reward": -2.113870143890381, "objective/rlhf_reward": -7.113845041304259, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 20.66805648803711, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.72265625, "step": 883, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9979139566421509 }, { "episode": 14160, "epoch": 0.08484020562965093, "loss/policy_avg": 0.12046757340431213, "lr": 9.435071574642126e-06, "objective/entropy": -139.48226928710938, "objective/kl": 35.96446228027344, "objective/non_score_reward": -1.7982230186462402, "objective/rlhf_reward": -5.833642208312435, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 5.584999084472656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.763671875, "step": 884, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985802173614502 }, { "episode": 14176, "epoch": 0.08493607026878049, "loss/policy_avg": 0.20259422063827515, "lr": 9.434432515337423e-06, "objective/entropy": -194.32472229003906, "objective/kl": 29.422592163085938, "objective/non_score_reward": -1.4711295366287231, "objective/rlhf_reward": -4.151184813181559, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 7.590093612670898, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.705078125, "step": 885, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000078678131104 }, { "episode": 14192, "epoch": 0.08503193490791003, "loss/policy_avg": 0.38378089666366577, "lr": 9.43379345603272e-06, "objective/entropy": -204.80718994140625, "objective/kl": 26.858444213867188, "objective/non_score_reward": -1.3429222106933594, "objective/rlhf_reward": -3.947856862743465, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 54.279869079589844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8828125, "step": 886, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000415086746216 }, { "episode": 14208, "epoch": 0.08512779954703958, "loss/policy_avg": 0.27804744243621826, "lr": 9.433154396728017e-06, "objective/entropy": -216.76026916503906, "objective/kl": 31.35245704650879, "objective/non_score_reward": -1.5676229000091553, "objective/rlhf_reward": -4.928855529337554, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 44.15214157104492, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.841796875, "step": 887, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985570907592773 }, { "episode": 14224, "epoch": 0.08522366418616913, "loss/policy_avg": 0.1285010725259781, "lr": 9.432515337423314e-06, "objective/entropy": -256.2292175292969, "objective/kl": 22.457351684570312, "objective/non_score_reward": -1.1228675842285156, "objective/rlhf_reward": -2.5440589291619613, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.694319725036621, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591796875, "step": 888, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9982428550720215 }, { "episode": 14240, "epoch": 0.08531952882529868, "loss/policy_avg": 0.1620079129934311, "lr": 9.431876278118611e-06, "objective/entropy": -246.3665313720703, "objective/kl": 32.27862548828125, "objective/non_score_reward": -1.6139311790466309, "objective/rlhf_reward": -5.03189285536584, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.128833293914795, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.626953125, "step": 889, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017335414886475 }, { "episode": 14256, "epoch": 0.08541539346442822, "loss/policy_avg": 0.6714350581169128, "lr": 9.431237218813906e-06, "objective/entropy": -87.00444793701172, "objective/kl": 30.12160873413086, "objective/non_score_reward": -1.5060807466506958, "objective/rlhf_reward": -4.600490768154231, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 30.543041229248047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 890, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9955544471740723 }, { "episode": 14272, "epoch": 0.08551125810355778, "loss/policy_avg": 0.5368032455444336, "lr": 9.430598159509203e-06, "objective/entropy": -151.2410125732422, "objective/kl": 23.1306095123291, "objective/non_score_reward": -1.1565306186676025, "objective/rlhf_reward": -3.266872340176983, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 18.648775100708008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 891, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999136209487915 }, { "episode": 14288, "epoch": 0.08560712274268732, "loss/policy_avg": -0.4043048024177551, "lr": 9.4299591002045e-06, "objective/entropy": -214.12281799316406, "objective/kl": 38.173484802246094, "objective/non_score_reward": -1.9086743593215942, "objective/rlhf_reward": -5.972837810934173, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.6675243377685547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.580078125, "step": 892, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000812292098999 }, { "episode": 14304, "epoch": 0.08570298738181688, "loss/policy_avg": 1.0885683298110962, "lr": 9.429320040899797e-06, "objective/entropy": -234.37998962402344, "objective/kl": 27.995094299316406, "objective/non_score_reward": -1.3997547626495361, "objective/rlhf_reward": -4.239769363139553, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.2649099826812744, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6640625, "step": 893, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0003528594970703 }, { "episode": 14320, "epoch": 0.08579885202094642, "loss/policy_avg": -0.1013278141617775, "lr": 9.428680981595094e-06, "objective/entropy": -156.33245849609375, "objective/kl": 35.587982177734375, "objective/non_score_reward": -1.779399037361145, "objective/rlhf_reward": -5.738993861762387, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 8.389669418334961, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.666015625, "step": 894, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.99942946434021 }, { "episode": 14336, "epoch": 0.08589471666007598, "loss/policy_avg": -0.006531273480504751, "lr": 9.42804192229039e-06, "objective/entropy": -197.26820373535156, "objective/kl": 21.04766082763672, "objective/non_score_reward": -1.0523829460144043, "objective/rlhf_reward": -2.6532727172046453, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.4280903339385986, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.69140625, "step": 895, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001786708831787 }, { "episode": 14352, "epoch": 0.08599058129920552, "loss/policy_avg": 0.10259456932544708, "lr": 9.427402862985686e-06, "objective/entropy": -120.49540710449219, "objective/kl": 37.17432403564453, "objective/non_score_reward": -1.858716368675232, "objective/rlhf_reward": -3.034865355491638, "objective/scores": 1.1, "policy/approxkl_avg": 6.6070685386657715, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.615234375, "step": 896, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999729871749878 }, { "episode": 14368, "epoch": 0.08608644593833507, "loss/policy_avg": 0.18344524502754211, "lr": 9.426763803680982e-06, "objective/entropy": -84.0172348022461, "objective/kl": 32.38622283935547, "objective/non_score_reward": -1.6193112134933472, "objective/rlhf_reward": -5.117994987700863, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 33.82829284667969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.529296875, "step": 897, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998817443847656 }, { "episode": 14384, "epoch": 0.08618231057746462, "loss/policy_avg": 0.7863380312919617, "lr": 9.42612474437628e-06, "objective/entropy": -94.4057388305664, "objective/kl": 31.75823402404785, "objective/non_score_reward": -1.58791184425354, "objective/rlhf_reward": -3.4279281839143962, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.406008243560791, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.8125, "step": 898, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0038909912109375 }, { "episode": 14400, "epoch": 0.08627817521659417, "loss/policy_avg": 0.5351603031158447, "lr": 9.425485685071576e-06, "objective/entropy": -265.2181396484375, "objective/kl": 29.21182632446289, "objective/non_score_reward": -1.460591197013855, "objective/rlhf_reward": -4.1090314547220865, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.054888725280762, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.57421875, "step": 899, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998013973236084 }, { "episode": 14416, "epoch": 0.08637403985572371, "loss/policy_avg": 0.013534091413021088, "lr": 9.424846625766873e-06, "objective/entropy": -194.56564331054688, "objective/kl": 24.970386505126953, "objective/non_score_reward": -1.2485194206237793, "objective/rlhf_reward": -3.6348278162225913, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 0.42985397577285767, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.53515625, "step": 900, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002646446228027 }, { "episode": 14432, "epoch": 0.08646990449485327, "loss/policy_avg": -0.026506464928388596, "lr": 9.424207566462168e-06, "objective/entropy": -121.82954406738281, "objective/kl": 38.97528839111328, "objective/non_score_reward": -1.9487645626068115, "objective/rlhf_reward": -7.795057892799377, "objective/scores": 0.0, "policy/approxkl_avg": 18.97709846496582, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.3564453125, "step": 901, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988110065460205 }, { "episode": 14448, "epoch": 0.08656576913398281, "loss/policy_avg": 0.04643288254737854, "lr": 9.423568507157465e-06, "objective/entropy": -97.38468170166016, "objective/kl": 28.042333602905273, "objective/non_score_reward": -1.4021167755126953, "objective/rlhf_reward": -4.184634823997585, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.1407618522644043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.60546875, "step": 902, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999892711639404 }, { "episode": 14464, "epoch": 0.08666163377311237, "loss/policy_avg": 0.5154027342796326, "lr": 9.42292944785276e-06, "objective/entropy": -250.2370147705078, "objective/kl": 25.91543960571289, "objective/non_score_reward": -1.2957720756530762, "objective/rlhf_reward": -3.759255845745174, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.9840008020401, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.607421875, "step": 903, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984569549560547 }, { "episode": 14480, "epoch": 0.08675749841224191, "loss/policy_avg": -0.12090878188610077, "lr": 9.422290388548057e-06, "objective/entropy": -224.9342041015625, "objective/kl": 21.860130310058594, "objective/non_score_reward": -1.0930064916610718, "objective/rlhf_reward": -2.42461485691541, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.253545761108398, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.751953125, "step": 904, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000030755996704 }, { "episode": 14496, "epoch": 0.08685336305137147, "loss/policy_avg": 0.2192097306251526, "lr": 9.421651329243354e-06, "objective/entropy": -116.75704956054688, "objective/kl": 40.641937255859375, "objective/non_score_reward": -2.0320968627929688, "objective/rlhf_reward": -6.786751320868163, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.1222383975982666, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.525390625, "step": 905, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990143775939941 }, { "episode": 14512, "epoch": 0.08694922769050101, "loss/policy_avg": 0.014911421574652195, "lr": 9.421012269938651e-06, "objective/entropy": -169.34967041015625, "objective/kl": 19.47471809387207, "objective/non_score_reward": -0.9737359285354614, "objective/rlhf_reward": -1.7722373626389838, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.2120165824890137, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.65234375, "step": 906, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0021252632141113 }, { "episode": 14528, "epoch": 0.08704509232963056, "loss/policy_avg": -0.06861399114131927, "lr": 9.420373210633948e-06, "objective/entropy": -199.73748779296875, "objective/kl": 32.33728790283203, "objective/non_score_reward": -1.6168644428253174, "objective/rlhf_reward": -5.088855722037655, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 7.329561233520508, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.58203125, "step": 907, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.1414122581481934 }, { "episode": 14544, "epoch": 0.0871409569687601, "loss/policy_avg": -0.0006491807289421558, "lr": 9.419734151329245e-06, "objective/entropy": -241.5037078857422, "objective/kl": 26.676612854003906, "objective/non_score_reward": -1.3338308334350586, "objective/rlhf_reward": -0.9353229761123654, "objective/scores": 1.1, "policy/approxkl_avg": 2.882882595062256, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.568359375, "step": 908, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995675086975098 }, { "episode": 14560, "epoch": 0.08723682160788966, "loss/policy_avg": -0.30844664573669434, "lr": 9.41909509202454e-06, "objective/entropy": -193.48281860351562, "objective/kl": 32.22890853881836, "objective/non_score_reward": -1.6114455461502075, "objective/rlhf_reward": -4.712448493639627, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 9.586688995361328, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.537109375, "step": 909, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0032057762145996 }, { "episode": 14576, "epoch": 0.0873326862470192, "loss/policy_avg": 0.10456671565771103, "lr": 9.418456032719837e-06, "objective/entropy": -214.8862762451172, "objective/kl": 30.845359802246094, "objective/non_score_reward": -1.5422677993774414, "objective/rlhf_reward": -4.769071197509765, "objective/scores": 0.35, "policy/approxkl_avg": 48.766883850097656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.560546875, "step": 910, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0017874240875244 }, { "episode": 14592, "epoch": 0.08742855088614876, "loss/policy_avg": 0.011322952806949615, "lr": 9.417816973415134e-06, "objective/entropy": -148.18869018554688, "objective/kl": 34.653785705566406, "objective/non_score_reward": -1.7326891422271729, "objective/rlhf_reward": -5.5069247080880075, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.303962230682373, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.740234375, "step": 911, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001537561416626 }, { "episode": 14608, "epoch": 0.0875244155252783, "loss/policy_avg": 1.4446654319763184, "lr": 9.41717791411043e-06, "objective/entropy": -151.7039337158203, "objective/kl": 36.139678955078125, "objective/non_score_reward": -1.8069840669631958, "objective/rlhf_reward": -5.623816165987568, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.342704772949219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7421875, "step": 912, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997730016708374 }, { "episode": 14624, "epoch": 0.08762028016440786, "loss/policy_avg": 0.007501431740820408, "lr": 9.416538854805727e-06, "objective/entropy": -192.82723999023438, "objective/kl": 28.006526947021484, "objective/non_score_reward": -1.4003264904022217, "objective/rlhf_reward": -3.776477153572153, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 9.397720336914062, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.74609375, "step": 913, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988361597061157 }, { "episode": 14640, "epoch": 0.0877161448035374, "loss/policy_avg": 0.7067223787307739, "lr": 9.415899795501023e-06, "objective/entropy": -199.13888549804688, "objective/kl": 40.245330810546875, "objective/non_score_reward": -2.0122666358947754, "objective/rlhf_reward": -6.387206798017608, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.6032171249389648, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.626953125, "step": 914, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0032029151916504 }, { "episode": 14656, "epoch": 0.08781200944266696, "loss/policy_avg": 0.7447987198829651, "lr": 9.41526073619632e-06, "objective/entropy": -192.03024291992188, "objective/kl": 33.84302520751953, "objective/non_score_reward": -1.6921510696411133, "objective/rlhf_reward": -5.212345330920771, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 12.58854866027832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.708984375, "step": 915, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982028007507324 }, { "episode": 14672, "epoch": 0.0879078740817965, "loss/policy_avg": -0.12448902428150177, "lr": 9.414621676891616e-06, "objective/entropy": -108.39199829101562, "objective/kl": 27.540185928344727, "objective/non_score_reward": -1.3770092725753784, "objective/rlhf_reward": -3.3853308580079418, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 0.6809393763542175, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.771484375, "step": 916, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0027718544006348 }, { "episode": 14688, "epoch": 0.08800373872092605, "loss/policy_avg": 0.09778769314289093, "lr": 9.413982617586913e-06, "objective/entropy": -83.20165252685547, "objective/kl": 27.68124008178711, "objective/non_score_reward": -1.3840619325637817, "objective/rlhf_reward": -3.7114191010323276, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.001269340515137, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 917, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994606971740723 }, { "episode": 14704, "epoch": 0.0880996033600556, "loss/policy_avg": 0.7267050743103027, "lr": 9.41334355828221e-06, "objective/entropy": -174.48663330078125, "objective/kl": 32.38935089111328, "objective/non_score_reward": -1.6194674968719482, "objective/rlhf_reward": -6.477869987487793, "objective/scores": 0.0, "policy/approxkl_avg": 9.753436088562012, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6796875, "step": 918, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9973618984222412 }, { "episode": 14720, "epoch": 0.08819546799918515, "loss/policy_avg": 0.18099595606327057, "lr": 9.412704498977507e-06, "objective/entropy": -232.4264373779297, "objective/kl": 37.20670700073242, "objective/non_score_reward": -1.860335350036621, "objective/rlhf_reward": -6.115828309088869, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 41.893341064453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.79296875, "step": 919, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002889633178711 }, { "episode": 14736, "epoch": 0.08829133263831469, "loss/policy_avg": 0.43639302253723145, "lr": 9.412065439672802e-06, "objective/entropy": -183.69644165039062, "objective/kl": 24.13558006286621, "objective/non_score_reward": -1.2067790031433105, "objective/rlhf_reward": -2.8797047836350753, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 30.2447509765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.55078125, "step": 920, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992506504058838 }, { "episode": 14752, "epoch": 0.08838719727744425, "loss/policy_avg": 0.5567411780357361, "lr": 9.411426380368099e-06, "objective/entropy": -285.06512451171875, "objective/kl": 32.89839553833008, "objective/non_score_reward": -1.644919753074646, "objective/rlhf_reward": -4.8463457385698945, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 18.15423583984375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.609375, "step": 921, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997804880142212 }, { "episode": 14768, "epoch": 0.0884830619165738, "loss/policy_avg": -0.024971559643745422, "lr": 9.410787321063396e-06, "objective/entropy": -144.00473022460938, "objective/kl": 27.061277389526367, "objective/non_score_reward": -1.353063941001892, "objective/rlhf_reward": -4.033653714743954, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.530630111694336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62890625, "step": 922, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0000722408294678 }, { "episode": 14784, "epoch": 0.08857892655570335, "loss/policy_avg": -0.49618157744407654, "lr": 9.410148261758691e-06, "objective/entropy": -37.43824768066406, "objective/kl": 35.81788635253906, "objective/non_score_reward": -1.7908943891525269, "objective/rlhf_reward": -5.501717870653259, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.131357192993164, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.4287109375, "step": 923, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0173897743225098 }, { "episode": 14800, "epoch": 0.0886747911948329, "loss/policy_avg": 0.05783979594707489, "lr": 9.409509202453988e-06, "objective/entropy": -154.13516235351562, "objective/kl": 46.57448196411133, "objective/non_score_reward": -2.3287243843078613, "objective/rlhf_reward": -7.653037791669952, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.5200886726379395, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5390625, "step": 924, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987819194793701 }, { "episode": 14816, "epoch": 0.08877065583396244, "loss/policy_avg": 0.034926094114780426, "lr": 9.408870143149285e-06, "objective/entropy": -221.52577209472656, "objective/kl": 35.47760772705078, "objective/non_score_reward": -1.7738804817199707, "objective/rlhf_reward": -5.614569070752024, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.4324332475662231, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66796875, "step": 925, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995288848876953 }, { "episode": 14832, "epoch": 0.088866520473092, "loss/policy_avg": 0.32427555322647095, "lr": 9.408231083844582e-06, "objective/entropy": -130.25445556640625, "objective/kl": 34.63972473144531, "objective/non_score_reward": -1.7319860458374023, "objective/rlhf_reward": -5.371684878078058, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.6408345699310303, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.568359375, "step": 926, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0012738704681396 }, { "episode": 14848, "epoch": 0.08896238511222154, "loss/policy_avg": -0.27763280272483826, "lr": 9.407592024539877e-06, "objective/entropy": -244.65667724609375, "objective/kl": 27.930646896362305, "objective/non_score_reward": -1.396532416343689, "objective/rlhf_reward": -3.7613009765473118, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 65.45894622802734, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.66015625, "step": 927, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002328872680664 }, { "episode": 14864, "epoch": 0.0890582497513511, "loss/policy_avg": 0.39164024591445923, "lr": 9.406952965235174e-06, "objective/entropy": -92.6754150390625, "objective/kl": 40.35970687866211, "objective/non_score_reward": -2.0179853439331055, "objective/rlhf_reward": -5.148222361446592, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.222280502319336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.587890625, "step": 928, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9965031147003174 }, { "episode": 14880, "epoch": 0.08915411439048064, "loss/policy_avg": 0.018820755183696747, "lr": 9.40631390593047e-06, "objective/entropy": -221.75802612304688, "objective/kl": 32.733848571777344, "objective/non_score_reward": -1.6366922855377197, "objective/rlhf_reward": -4.942649397913533, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.5601739883422852, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.67578125, "step": 929, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0016555786132812 }, { "episode": 14896, "epoch": 0.0892499790296102, "loss/policy_avg": 0.02956710010766983, "lr": 9.405674846625768e-06, "objective/entropy": -225.1991729736328, "objective/kl": 27.00541114807129, "objective/non_score_reward": -1.3502705097198486, "objective/rlhf_reward": -4.059446623831421, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 6.755413055419922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8046875, "step": 930, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001051664352417 }, { "episode": 14912, "epoch": 0.08934584366873974, "loss/policy_avg": 0.08642945438623428, "lr": 9.405035787321065e-06, "objective/entropy": -179.3356475830078, "objective/kl": 36.390193939208984, "objective/non_score_reward": -1.8195096254348755, "objective/rlhf_reward": -5.330627392010625, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 10.583852767944336, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4501953125, "step": 931, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999453067779541 }, { "episode": 14928, "epoch": 0.0894417083078693, "loss/policy_avg": 0.08466912060976028, "lr": 9.404396728016361e-06, "objective/entropy": -160.34024047851562, "objective/kl": 48.99607849121094, "objective/non_score_reward": -2.4498043060302734, "objective/rlhf_reward": -8.195096407000142, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 0.9886335134506226, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.666015625, "step": 932, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0025954246520996 }, { "episode": 14944, "epoch": 0.08953757294699884, "loss/policy_avg": 0.3508598804473877, "lr": 9.403757668711657e-06, "objective/entropy": -177.20993041992188, "objective/kl": 32.381324768066406, "objective/non_score_reward": -1.6190659999847412, "objective/rlhf_reward": -5.150751504927797, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 31.277324676513672, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4716796875, "step": 933, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997992515563965 }, { "episode": 14960, "epoch": 0.08963343758612839, "loss/policy_avg": 0.11015394330024719, "lr": 9.403118609406953e-06, "objective/entropy": -203.39776611328125, "objective/kl": 32.743534088134766, "objective/non_score_reward": -1.637176752090454, "objective/rlhf_reward": -4.94458726412447, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.4484572410583496, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.734375, "step": 934, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9993422031402588 }, { "episode": 14976, "epoch": 0.08972930222525793, "loss/policy_avg": -0.10944172739982605, "lr": 9.40247955010225e-06, "objective/entropy": -241.4989013671875, "objective/kl": 21.90488052368164, "objective/non_score_reward": -1.0952439308166504, "objective/rlhf_reward": -3.0217259762033652, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.7654926776885986, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.689453125, "step": 935, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002392292022705 }, { "episode": 14992, "epoch": 0.08982516686438749, "loss/policy_avg": 0.9405217170715332, "lr": 9.401840490797547e-06, "objective/entropy": -237.89816284179688, "objective/kl": 25.436769485473633, "objective/non_score_reward": -1.2718384265899658, "objective/rlhf_reward": -3.663521905143825, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 6.3816022872924805, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.60546875, "step": 936, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998328685760498 }, { "episode": 15008, "epoch": 0.08992103150351703, "loss/policy_avg": 0.3327906131744385, "lr": 9.401201431492844e-06, "objective/entropy": -268.6925354003906, "objective/kl": 37.998870849609375, "objective/non_score_reward": -1.899943470954895, "objective/rlhf_reward": -6.0435144593387395, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 10.16036605834961, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.607421875, "step": 937, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984662532806396 }, { "episode": 15024, "epoch": 0.09001689614264659, "loss/policy_avg": -0.26467132568359375, "lr": 9.40056237218814e-06, "objective/entropy": -231.59254455566406, "objective/kl": 26.266529083251953, "objective/non_score_reward": -1.3133264780044556, "objective/rlhf_reward": -3.737534248622593, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.63685417175293, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6328125, "step": 938, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999743938446045 }, { "episode": 15040, "epoch": 0.09011276078177613, "loss/policy_avg": 0.2447420209646225, "lr": 9.399923312883436e-06, "objective/entropy": -278.01153564453125, "objective/kl": 27.628671646118164, "objective/non_score_reward": -1.3814334869384766, "objective/rlhf_reward": -4.147132196513516, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.7261061668395996, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.669921875, "step": 939, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990365505218506 }, { "episode": 15056, "epoch": 0.09020862542090569, "loss/policy_avg": 0.2600797414779663, "lr": 9.399284253578733e-06, "objective/entropy": -242.6852264404297, "objective/kl": 40.91444396972656, "objective/non_score_reward": -2.045722484588623, "objective/rlhf_reward": -6.060183467642341, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.501818656921387, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484375, "step": 940, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971075057983398 }, { "episode": 15072, "epoch": 0.09030449006003523, "loss/policy_avg": 0.3729836940765381, "lr": 9.39864519427403e-06, "objective/entropy": -225.56338500976562, "objective/kl": 34.106658935546875, "objective/non_score_reward": -1.7053331136703491, "objective/rlhf_reward": -5.340379836972117, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.6144325733184814, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.73828125, "step": 941, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977765083312988 }, { "episode": 15088, "epoch": 0.09040035469916478, "loss/policy_avg": 0.571183443069458, "lr": 9.398006134969327e-06, "objective/entropy": -109.51638793945312, "objective/kl": 57.49871826171875, "objective/non_score_reward": -2.8749358654022217, "objective/rlhf_reward": -9.895623478952961, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.06275749206543, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.736328125, "step": 942, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000272512435913 }, { "episode": 15104, "epoch": 0.09049621933829433, "loss/policy_avg": 0.7253443002700806, "lr": 9.397367075664624e-06, "objective/entropy": -69.86570739746094, "objective/kl": 40.12030029296875, "objective/non_score_reward": -2.0060153007507324, "objective/rlhf_reward": -6.362201397836792, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 66.08172607421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 943, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997114896774292 }, { "episode": 15120, "epoch": 0.09059208397742388, "loss/policy_avg": 0.7548943758010864, "lr": 9.396728016359919e-06, "objective/entropy": -264.1029357910156, "objective/kl": 29.125934600830078, "objective/non_score_reward": -1.456296682357788, "objective/rlhf_reward": -4.268927424159601, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.555539846420288, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6328125, "step": 944, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0007224082946777 }, { "episode": 15136, "epoch": 0.09068794861655342, "loss/policy_avg": -0.06224450469017029, "lr": 9.396088957055216e-06, "objective/entropy": -215.80255126953125, "objective/kl": 36.1290283203125, "objective/non_score_reward": -1.8064515590667725, "objective/rlhf_reward": -5.7100345728718604, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.062628746032715, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5703125, "step": 945, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0022828578948975 }, { "episode": 15152, "epoch": 0.09078381325568298, "loss/policy_avg": -0.34320878982543945, "lr": 9.395449897750511e-06, "objective/entropy": -254.14260864257812, "objective/kl": 24.163818359375, "objective/non_score_reward": -1.20819091796875, "objective/rlhf_reward": -2.4327639102935787, "objective/scores": 0.6, "policy/approxkl_avg": 3.011139392852783, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.62109375, "step": 946, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0023422241210938 }, { "episode": 15168, "epoch": 0.09087967789481252, "loss/policy_avg": 0.08071097731590271, "lr": 9.394810838445808e-06, "objective/entropy": -269.91180419921875, "objective/kl": 29.857431411743164, "objective/non_score_reward": -1.4928715229034424, "objective/rlhf_reward": -3.8487801573434215, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.305149555206299, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646484375, "step": 947, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971578121185303 }, { "episode": 15184, "epoch": 0.09097554253394208, "loss/policy_avg": -0.019624732434749603, "lr": 9.394171779141105e-06, "objective/entropy": -274.10198974609375, "objective/kl": 33.219993591308594, "objective/non_score_reward": -1.6609996557235718, "objective/rlhf_reward": -6.643998503684998, "objective/scores": 0.0, "policy/approxkl_avg": 4.708046913146973, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015625, "step": 948, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000514030456543 }, { "episode": 15200, "epoch": 0.09107140717307162, "loss/policy_avg": -0.5435956716537476, "lr": 9.393532719836402e-06, "objective/entropy": -245.58270263671875, "objective/kl": 26.876476287841797, "objective/non_score_reward": -1.3438239097595215, "objective/rlhf_reward": -3.771175924603062, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.178674697875977, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.630859375, "step": 949, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0033979415893555 }, { "episode": 15216, "epoch": 0.09116727181220118, "loss/policy_avg": 0.6083466410636902, "lr": 9.392893660531698e-06, "objective/entropy": -169.32357788085938, "objective/kl": 38.449127197265625, "objective/non_score_reward": -1.9224563837051392, "objective/rlhf_reward": -6.133565931525782, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 8.572129249572754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.685546875, "step": 950, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000035285949707 }, { "episode": 15232, "epoch": 0.09126313645133072, "loss/policy_avg": 0.1515914499759674, "lr": 9.392254601226994e-06, "objective/entropy": -181.75010681152344, "objective/kl": 31.95659637451172, "objective/non_score_reward": -1.5978299379348755, "objective/rlhf_reward": -5.04968385985437, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 12.761173248291016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.798828125, "step": 951, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977487325668335 }, { "episode": 15248, "epoch": 0.09135900109046027, "loss/policy_avg": 0.7638048529624939, "lr": 9.39161554192229e-06, "objective/entropy": -158.99050903320312, "objective/kl": 39.69103240966797, "objective/non_score_reward": -1.9845517873764038, "objective/rlhf_reward": -5.815500917212043, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.06544303894043, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.541015625, "step": 952, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981579780578613 }, { "episode": 15264, "epoch": 0.09145486572958982, "loss/policy_avg": 0.764492392539978, "lr": 9.390976482617587e-06, "objective/entropy": -159.26947021484375, "objective/kl": 28.415475845336914, "objective/non_score_reward": -1.4207737445831299, "objective/rlhf_reward": -5.683095276355743, "objective/scores": 0.0, "policy/approxkl_avg": 7.907594680786133, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6328125, "step": 953, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997883677482605 }, { "episode": 15280, "epoch": 0.09155073036871937, "loss/policy_avg": 0.3368009328842163, "lr": 9.390337423312884e-06, "objective/entropy": -173.85415649414062, "objective/kl": 35.513309478759766, "objective/non_score_reward": -1.775665521621704, "objective/rlhf_reward": -5.49854234224947, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.337751388549805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.541015625, "step": 954, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994667768478394 }, { "episode": 15296, "epoch": 0.09164659500784891, "loss/policy_avg": 0.0456845797598362, "lr": 9.389698364008181e-06, "objective/entropy": 16.725250244140625, "objective/kl": 36.44686508178711, "objective/non_score_reward": -1.822343349456787, "objective/rlhf_reward": -5.865541179378596, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 21.832763671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.548828125, "step": 955, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999651551246643 }, { "episode": 15312, "epoch": 0.09174245964697847, "loss/policy_avg": 0.0268879272043705, "lr": 9.389059304703478e-06, "objective/entropy": -219.0832977294922, "objective/kl": 25.021286010742188, "objective/non_score_reward": -1.2510643005371094, "objective/rlhf_reward": -3.1794285133209934, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 13.525361061096191, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.673828125, "step": 956, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0001580715179443 }, { "episode": 15328, "epoch": 0.09183832428610801, "loss/policy_avg": 0.25198429822921753, "lr": 9.388420245398773e-06, "objective/entropy": -216.4515838623047, "objective/kl": 29.98337173461914, "objective/non_score_reward": -1.4991683959960938, "objective/rlhf_reward": -3.0729548081171245, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.199630737304688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.634765625, "step": 957, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9965362548828125 }, { "episode": 15344, "epoch": 0.09193418892523757, "loss/policy_avg": 0.035516731441020966, "lr": 9.38778118609407e-06, "objective/entropy": -250.8704833984375, "objective/kl": 30.556961059570312, "objective/non_score_reward": -1.5278480052947998, "objective/rlhf_reward": -4.73278991231094, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.100607395172119, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.607421875, "step": 958, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005221366882324 }, { "episode": 15360, "epoch": 0.09203005356436711, "loss/policy_avg": 0.6594608426094055, "lr": 9.387142126789367e-06, "objective/entropy": -190.2021942138672, "objective/kl": 29.693756103515625, "objective/non_score_reward": -1.4846878051757812, "objective/rlhf_reward": -4.38249173661764, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 11.999906539916992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.55859375, "step": 959, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9964005947113037 }, { "episode": 15376, "epoch": 0.09212591820349667, "loss/policy_avg": 0.16847842931747437, "lr": 9.386503067484664e-06, "objective/entropy": -220.72311401367188, "objective/kl": 22.618806838989258, "objective/non_score_reward": -1.1309404373168945, "objective/rlhf_reward": -3.0731633110955805, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.5775080919265747, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.63671875, "step": 960, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0047149658203125 }, { "episode": 15392, "epoch": 0.09222178284262621, "loss/policy_avg": 0.37361010909080505, "lr": 9.38586400817996e-06, "objective/entropy": -219.60760498046875, "objective/kl": 31.668062210083008, "objective/non_score_reward": -1.58340322971344, "objective/rlhf_reward": -4.852660181935191, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.965027809143066, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.615234375, "step": 961, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9987406730651855 }, { "episode": 15408, "epoch": 0.09231764748175576, "loss/policy_avg": 0.3272181749343872, "lr": 9.385224948875256e-06, "objective/entropy": -200.26370239257812, "objective/kl": 38.33747100830078, "objective/non_score_reward": -1.916873574256897, "objective/rlhf_reward": -5.720082710461552, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.9499969482421875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.583984375, "step": 962, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983041286468506 }, { "episode": 15424, "epoch": 0.0924135121208853, "loss/policy_avg": 0.02453005313873291, "lr": 9.384585889570553e-06, "objective/entropy": -259.0159606933594, "objective/kl": 32.376686096191406, "objective/non_score_reward": -1.6188342571258545, "objective/rlhf_reward": -5.051504810054866, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.491250038146973, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.634765625, "step": 963, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0016989707946777 }, { "episode": 15440, "epoch": 0.09250937676001486, "loss/policy_avg": -0.1082817018032074, "lr": 9.38394683026585e-06, "objective/entropy": -136.52200317382812, "objective/kl": 34.37030792236328, "objective/non_score_reward": -1.718515396118164, "objective/rlhf_reward": -5.212202077329742, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.610563278198242, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.65234375, "step": 964, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993445873260498 }, { "episode": 15456, "epoch": 0.0926052413991444, "loss/policy_avg": 0.3635658025741577, "lr": 9.383307770961147e-06, "objective/entropy": -242.04705810546875, "objective/kl": 26.167871475219727, "objective/non_score_reward": -1.3083934783935547, "objective/rlhf_reward": -3.8335740923881527, "objective/scores": 0.35, "policy/approxkl_avg": 10.497917175292969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.505859375, "step": 965, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998986840248108 }, { "episode": 15472, "epoch": 0.09270110603827396, "loss/policy_avg": 0.4805383086204529, "lr": 9.382668711656443e-06, "objective/entropy": -130.80931091308594, "objective/kl": 43.840057373046875, "objective/non_score_reward": -2.192002773284912, "objective/rlhf_reward": -6.368011450767517, "objective/scores": 0.6, "policy/approxkl_avg": 1.2675271034240723, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619140625, "step": 966, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001443862915039 }, { "episode": 15488, "epoch": 0.0927969706774035, "loss/policy_avg": 0.9434456825256348, "lr": 9.382029652351739e-06, "objective/entropy": -116.85310363769531, "objective/kl": 55.79869842529297, "objective/non_score_reward": -2.7899351119995117, "objective/rlhf_reward": -9.426406518618265, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.6991868019104004, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.791015625, "step": 967, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00309157371521 }, { "episode": 15504, "epoch": 0.09289283531653306, "loss/policy_avg": 0.2830507755279541, "lr": 9.381390593047035e-06, "objective/entropy": -260.5260925292969, "objective/kl": 34.16276550292969, "objective/non_score_reward": -1.7081382274627686, "objective/rlhf_reward": -5.381954531283721, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.792706489562988, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.615234375, "step": 968, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985530376434326 }, { "episode": 15520, "epoch": 0.0929886999556626, "loss/policy_avg": 0.19756931066513062, "lr": 9.380751533742332e-06, "objective/entropy": -234.741455078125, "objective/kl": 25.891204833984375, "objective/non_score_reward": -1.2945603132247925, "objective/rlhf_reward": -3.055534782187019, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.262695789337158, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.640625, "step": 969, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0007271766662598 }, { "episode": 15536, "epoch": 0.09308456459479215, "loss/policy_avg": 0.0513734444975853, "lr": 9.380112474437628e-06, "objective/entropy": -195.60171508789062, "objective/kl": 35.50217819213867, "objective/non_score_reward": -1.775109052658081, "objective/rlhf_reward": -5.741186105941219, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.6989755630493164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.607421875, "step": 970, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0012567043304443 }, { "episode": 15552, "epoch": 0.0931804292339217, "loss/policy_avg": 0.1513216644525528, "lr": 9.379473415132924e-06, "objective/entropy": -245.57977294921875, "objective/kl": 23.89773941040039, "objective/non_score_reward": -1.1948869228363037, "objective/rlhf_reward": -4.7795480489730835, "objective/scores": 0.0, "policy/approxkl_avg": 6.129580020904541, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.69921875, "step": 971, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000563621520996 }, { "episode": 15568, "epoch": 0.09327629387305125, "loss/policy_avg": 0.041885554790496826, "lr": 9.378834355828221e-06, "objective/entropy": -261.82769775390625, "objective/kl": 24.18181037902832, "objective/non_score_reward": -1.2090904712677002, "objective/rlhf_reward": -3.457759955016476, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.62070369720459, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.533203125, "step": 972, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9969148635864258 }, { "episode": 15584, "epoch": 0.0933721585121808, "loss/policy_avg": 0.012015002779662609, "lr": 9.378195296523518e-06, "objective/entropy": -251.767333984375, "objective/kl": 27.563173294067383, "objective/non_score_reward": -1.378158688545227, "objective/rlhf_reward": -3.908514711920338, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.0967427492141724, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.634765625, "step": 973, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0006260871887207 }, { "episode": 15600, "epoch": 0.09346802315131035, "loss/policy_avg": -0.31819072365760803, "lr": 9.377556237218815e-06, "objective/entropy": -175.70556640625, "objective/kl": 28.285152435302734, "objective/non_score_reward": -1.4142576456069946, "objective/rlhf_reward": -4.052910540167408, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.37001371383667, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.505859375, "step": 974, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995312690734863 }, { "episode": 15616, "epoch": 0.09356388779043989, "loss/policy_avg": 0.6060304641723633, "lr": 9.37691717791411e-06, "objective/entropy": -34.974281311035156, "objective/kl": 35.56610107421875, "objective/non_score_reward": -1.7783050537109375, "objective/rlhf_reward": -5.59744867065781, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.845120906829834, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.853515625, "step": 975, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995267391204834 }, { "episode": 15632, "epoch": 0.09365975242956945, "loss/policy_avg": 0.1691616326570511, "lr": 9.376278118609407e-06, "objective/entropy": -173.51535034179688, "objective/kl": 40.181976318359375, "objective/non_score_reward": -2.009099006652832, "objective/rlhf_reward": -6.657793619719845, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.46673262119293213, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71875, "step": 976, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0020508766174316 }, { "episode": 15648, "epoch": 0.09375561706869899, "loss/policy_avg": 0.12263473123311996, "lr": 9.375639059304704e-06, "objective/entropy": -244.26974487304688, "objective/kl": 29.573442459106445, "objective/non_score_reward": -1.4786722660064697, "objective/rlhf_reward": -4.358429758754328, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.748386859893799, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.693359375, "step": 977, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9990627765655518 }, { "episode": 15664, "epoch": 0.09385148170782855, "loss/policy_avg": 1.4557695388793945, "lr": 9.375000000000001e-06, "objective/entropy": -133.55853271484375, "objective/kl": 45.2318229675293, "objective/non_score_reward": -2.2615909576416016, "objective/rlhf_reward": -7.530592167171177, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 4.7986626625061035, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.654296875, "step": 978, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999971389770508 }, { "episode": 15680, "epoch": 0.09394734634695809, "loss/policy_avg": 0.04724450409412384, "lr": 9.374360940695298e-06, "objective/entropy": -291.25103759765625, "objective/kl": 28.29153823852539, "objective/non_score_reward": -1.4145770072937012, "objective/rlhf_reward": -3.710896800236638, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.313387393951416, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625, "step": 979, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993375539779663 }, { "episode": 15696, "epoch": 0.09404321098608764, "loss/policy_avg": 0.2293320745229721, "lr": 9.373721881390595e-06, "objective/entropy": -136.44857788085938, "objective/kl": 38.36898422241211, "objective/non_score_reward": -1.9184492826461792, "objective/rlhf_reward": -5.551090779081855, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.303453207015991, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.61328125, "step": 980, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999606966972351 }, { "episode": 15712, "epoch": 0.0941390756252172, "loss/policy_avg": 0.16989938914775848, "lr": 9.37308282208589e-06, "objective/entropy": -171.79864501953125, "objective/kl": 32.806495666503906, "objective/non_score_reward": -1.640324592590332, "objective/rlhf_reward": -4.613887022213872, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.31067180633545, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603515625, "step": 981, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984290599822998 }, { "episode": 15728, "epoch": 0.09423494026434674, "loss/policy_avg": 0.7234645485877991, "lr": 9.372443762781187e-06, "objective/entropy": -219.93374633789062, "objective/kl": 26.91738510131836, "objective/non_score_reward": -1.3458693027496338, "objective/rlhf_reward": -0.9834773302078244, "objective/scores": 1.1, "policy/approxkl_avg": 1.4521507024765015, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.646484375, "step": 982, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003150463104248 }, { "episode": 15744, "epoch": 0.0943308049034763, "loss/policy_avg": 0.48133015632629395, "lr": 9.371804703476484e-06, "objective/entropy": -282.47552490234375, "objective/kl": 39.29179763793945, "objective/non_score_reward": -1.9645898342132568, "objective/rlhf_reward": -6.125026241938272, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.169063568115234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.720703125, "step": 983, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997374415397644 }, { "episode": 15760, "epoch": 0.09442666954260584, "loss/policy_avg": 0.1187177523970604, "lr": 9.37116564417178e-06, "objective/entropy": -158.33642578125, "objective/kl": 40.20547103881836, "objective/non_score_reward": -2.0102736949920654, "objective/rlhf_reward": -6.69945864966455, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.5165886878967285, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62109375, "step": 984, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981753826141357 }, { "episode": 15776, "epoch": 0.0945225341817354, "loss/policy_avg": 0.16677279770374298, "lr": 9.370526584867077e-06, "objective/entropy": -162.21728515625, "objective/kl": 33.61964797973633, "objective/non_score_reward": -1.6809823513031006, "objective/rlhf_reward": -5.323929286003112, "objective/scores": 0.35, "policy/approxkl_avg": 5.913999557495117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.734375, "step": 985, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9968831539154053 }, { "episode": 15792, "epoch": 0.09461839882086494, "loss/policy_avg": 0.22338780760765076, "lr": 9.369887525562373e-06, "objective/entropy": -191.39588928222656, "objective/kl": 50.39151382446289, "objective/non_score_reward": -2.519575595855713, "objective/rlhf_reward": -8.416443472326385, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 45.444732666015625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.755859375, "step": 986, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998542308807373 }, { "episode": 15808, "epoch": 0.0947142634599945, "loss/policy_avg": 0.37791919708251953, "lr": 9.36924846625767e-06, "objective/entropy": -270.806396484375, "objective/kl": 29.205078125, "objective/non_score_reward": -1.4602539539337158, "objective/rlhf_reward": -5.841015696525574, "objective/scores": 0.0, "policy/approxkl_avg": 8.895004272460938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6875, "step": 987, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979805946350098 }, { "episode": 15824, "epoch": 0.09481012809912404, "loss/policy_avg": 0.7314577102661133, "lr": 9.368609406952966e-06, "objective/entropy": -174.33633422851562, "objective/kl": 41.00555419921875, "objective/non_score_reward": -2.0502774715423584, "objective/rlhf_reward": -6.77727790613946, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.151052474975586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59375, "step": 988, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998262882232666 }, { "episode": 15840, "epoch": 0.09490599273825359, "loss/policy_avg": 0.1200692355632782, "lr": 9.367970347648263e-06, "objective/entropy": -259.9232177734375, "objective/kl": 32.56160354614258, "objective/non_score_reward": -1.628080129623413, "objective/rlhf_reward": -5.112320518493652, "objective/scores": 0.35, "policy/approxkl_avg": 3.3896703720092773, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.615234375, "step": 989, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0001492500305176 }, { "episode": 15856, "epoch": 0.09500185737738313, "loss/policy_avg": 0.7871278524398804, "lr": 9.367331288343558e-06, "objective/entropy": -162.90664672851562, "objective/kl": 37.55353927612305, "objective/non_score_reward": -1.8776767253875732, "objective/rlhf_reward": -6.086875279148188, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 24.93891716003418, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7890625, "step": 990, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9959328174591064 }, { "episode": 15872, "epoch": 0.09509772201651269, "loss/policy_avg": -0.12516134977340698, "lr": 9.366692229038855e-06, "objective/entropy": -238.83116149902344, "objective/kl": 37.03616714477539, "objective/non_score_reward": -1.8518084287643433, "objective/rlhf_reward": -6.047983967994137, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 15.576482772827148, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.681640625, "step": 991, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985809326171875 }, { "episode": 15888, "epoch": 0.09519358665564223, "loss/policy_avg": -0.04968651384115219, "lr": 9.366053169734152e-06, "objective/entropy": -183.43231201171875, "objective/kl": 35.40851593017578, "objective/non_score_reward": -1.77042555809021, "objective/rlhf_reward": -5.756189737349672, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 0.5774535536766052, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.583984375, "step": 992, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002680778503418 }, { "episode": 15904, "epoch": 0.09528945129477179, "loss/policy_avg": 0.009859908372163773, "lr": 9.365414110429449e-06, "objective/entropy": -14.670166015625, "objective/kl": 53.70581817626953, "objective/non_score_reward": -2.685290813446045, "objective/rlhf_reward": -8.3411630153656, "objective/scores": 0.6, "policy/approxkl_avg": 1.3184102773666382, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681640625, "step": 993, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005533695220947 }, { "episode": 15920, "epoch": 0.09538531593390133, "loss/policy_avg": 0.3695295453071594, "lr": 9.364775051124744e-06, "objective/entropy": -288.468505859375, "objective/kl": 32.96984100341797, "objective/non_score_reward": -1.6484923362731934, "objective/rlhf_reward": -5.0781975624882545, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.1653892993927, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.642578125, "step": 994, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999734878540039 }, { "episode": 15936, "epoch": 0.09548118057303089, "loss/policy_avg": 0.3992432951927185, "lr": 9.364135991820041e-06, "objective/entropy": -231.646728515625, "objective/kl": 34.67195510864258, "objective/non_score_reward": -1.733597755432129, "objective/rlhf_reward": -5.510559280117121, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 19.767539978027344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.849609375, "step": 995, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987037181854248 }, { "episode": 15952, "epoch": 0.09557704521216043, "loss/policy_avg": 0.03356311097741127, "lr": 9.363496932515338e-06, "objective/entropy": -210.72410583496094, "objective/kl": 27.1010799407959, "objective/non_score_reward": -1.3550540208816528, "objective/rlhf_reward": -3.595387215885233, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.0958271026611328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7265625, "step": 996, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994826316833496 }, { "episode": 15968, "epoch": 0.09567290985128998, "loss/policy_avg": 1.1218140125274658, "lr": 9.362857873210635e-06, "objective/entropy": -71.63316345214844, "objective/kl": 40.19666290283203, "objective/non_score_reward": -2.009833335876465, "objective/rlhf_reward": -8.03933310508728, "objective/scores": 0.0, "policy/approxkl_avg": 3.4838500022888184, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66015625, "step": 997, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991953372955322 }, { "episode": 15984, "epoch": 0.09576877449041953, "loss/policy_avg": 0.23440885543823242, "lr": 9.362218813905932e-06, "objective/entropy": -217.69229125976562, "objective/kl": 26.445728302001953, "objective/non_score_reward": -1.3222863674163818, "objective/rlhf_reward": -3.773374044688877, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 11.445338249206543, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.623046875, "step": 998, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9970027208328247 }, { "episode": 16000, "epoch": 0.09586463912954908, "loss/policy_avg": -0.3169388175010681, "lr": 9.361579754601227e-06, "objective/entropy": -116.28077697753906, "objective/kl": 44.722564697265625, "objective/non_score_reward": -2.236128091812134, "objective/rlhf_reward": -6.997101019101079, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.412589073181152, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.734375, "step": 999, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000357151031494 } ], "logging_steps": 500, "max_steps": 7824, "num_input_tokens_seen": 0, "num_train_epochs": 3.0, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": true, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }