diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" deleted file mode 100644--- "a/checkpoint-1500/trainer_state.json" +++ /dev/null @@ -1,27034 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "episode": 24000, - "epoch": 0.14379695869432363, - "eval_steps": 500, - "global_step": 1500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "episode": 16, - "epoch": 9.586463912954908e-05, - "loss/policy_avg": 0.015691569074988365, - "lr": 1e-05, - "objective/entropy": 136.889404296875, - "objective/kl": 13.172518730163574, - "objective/non_score_reward": -0.6586259603500366, - "objective/rlhf_reward": -1.2559016580260813, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 330.0568542480469, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.75, - "step": 0, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999426007270813 - }, - { - "episode": 32, - "epoch": 0.00019172927825909816, - "loss/policy_avg": 0.021727558225393295, - "lr": 9.999360940695298e-06, - "objective/entropy": -4.705432891845703, - "objective/kl": 4.4086012840271, - "objective/non_score_reward": -0.22043009102344513, - "objective/rlhf_reward": 0.49688179692854306, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 25.247615814208984, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4375, - "step": 1, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0005669593811035 - }, - { - "episode": 48, - "epoch": 0.00028759391738864725, - "loss/policy_avg": 0.05422616004943848, - "lr": 9.998721881390595e-06, - "objective/entropy": 26.511795043945312, - "objective/kl": 10.364278793334961, - "objective/non_score_reward": -0.5182140469551086, - "objective/rlhf_reward": -0.6222579917923059, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 174.7788543701172, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6328125, - "step": 2, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001580238342285 - }, - { - "episode": 64, - "epoch": 0.0003834585565181963, - "loss/policy_avg": 0.1031150370836258, - "lr": 9.99808282208589e-06, - "objective/entropy": -6.2874298095703125, - "objective/kl": 7.10389518737793, - "objective/non_score_reward": -0.35519474744796753, - "objective/rlhf_reward": 0.24108044284523888, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 107.51742553710938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.806640625, - "step": 3, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9999995231628418 - }, - { - "episode": 80, - "epoch": 0.0004793231956477454, - "loss/policy_avg": 0.020609447732567787, - "lr": 9.997443762781187e-06, - "objective/entropy": 63.54547882080078, - "objective/kl": 1.458254337310791, - "objective/non_score_reward": -0.07291271537542343, - "objective/rlhf_reward": 1.224120924828116, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 14.240117073059082, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4150390625, - "step": 4, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000563383102417 - }, - { - "episode": 96, - "epoch": 0.0005751878347772945, - "loss/policy_avg": 0.1277482807636261, - "lr": 9.996804703476484e-06, - "objective/entropy": 55.068546295166016, - "objective/kl": 8.753851890563965, - "objective/non_score_reward": -0.43769264221191406, - "objective/rlhf_reward": -0.37216834077010735, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 100.08578491210938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.447265625, - "step": 5, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999474048614502 - }, - { - "episode": 112, - "epoch": 0.0006710524739068436, - "loss/policy_avg": 0.3148539662361145, - "lr": 9.99616564417178e-06, - "objective/entropy": 21.463600158691406, - "objective/kl": 9.847577095031738, - "objective/non_score_reward": -0.4923788607120514, - "objective/rlhf_reward": -0.02210425861352272, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 82.89840698242188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.619140625, - "step": 6, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998772382736206 - }, - { - "episode": 128, - "epoch": 0.0007669171130363926, - "loss/policy_avg": -9.760260581970215e-06, - "lr": 9.995526584867077e-06, - "objective/entropy": 43.514984130859375, - "objective/kl": 6.468422889709473, - "objective/non_score_reward": -0.3234211802482605, - "objective/rlhf_reward": 0.18726797867262368, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 53.660911560058594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.595703125, - "step": 7, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0024185180664062 - }, - { - "episode": 144, - "epoch": 0.0008627817521659417, - "loss/policy_avg": 0.07420124113559723, - "lr": 9.994887525562374e-06, - "objective/entropy": 111.558837890625, - "objective/kl": 5.765064716339111, - "objective/non_score_reward": -0.2882532477378845, - "objective/rlhf_reward": 0.7943982454372089, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 38.34186935424805, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4462890625, - "step": 8, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9975563287734985 - }, - { - "episode": 160, - "epoch": 0.0009586463912954908, - "loss/policy_avg": 0.22252294421195984, - "lr": 9.99424846625767e-06, - "objective/entropy": 99.2086181640625, - "objective/kl": 8.770297050476074, - "objective/non_score_reward": -0.4385148584842682, - "objective/rlhf_reward": -0.35405938923358926, - "objective/scores": 0.35, - "policy/approxkl_avg": 98.07421112060547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.75, - "step": 9, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9961905479431152 - }, - { - "episode": 176, - "epoch": 0.0010545110304250398, - "loss/policy_avg": 0.05278925597667694, - "lr": 9.993609406952966e-06, - "objective/entropy": 192.25936889648438, - "objective/kl": 5.483057975769043, - "objective/non_score_reward": -0.27415287494659424, - "objective/rlhf_reward": 1.3033885151147842, - "objective/scores": 0.6, - "policy/approxkl_avg": 54.852699279785156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.73046875, - "step": 10, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001378059387207 - }, - { - "episode": 192, - "epoch": 0.001150375669554589, - "loss/policy_avg": 0.01604432426393032, - "lr": 9.992970347648263e-06, - "objective/entropy": 91.4354476928711, - "objective/kl": 1.6482281684875488, - "objective/non_score_reward": -0.08241140842437744, - "objective/rlhf_reward": 1.1513069728358984, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 12.662862777709961, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5390625, - "step": 11, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994404315948486 - }, - { - "episode": 208, - "epoch": 0.001246240308684138, - "loss/policy_avg": 0.17367278039455414, - "lr": 9.992331288343558e-06, - "objective/entropy": 148.37680053710938, - "objective/kl": 9.977045059204102, - "objective/non_score_reward": -0.4988522529602051, - "objective/rlhf_reward": -0.4796372515880427, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 132.6361083984375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4619140625, - "step": 12, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9963808059692383 - }, - { - "episode": 224, - "epoch": 0.0013421049478136871, - "loss/policy_avg": -0.12138635665178299, - "lr": 9.991692229038855e-06, - "objective/entropy": -70.20156860351562, - "objective/kl": 3.8376624584198, - "objective/non_score_reward": -0.1918831169605255, - "objective/rlhf_reward": 0.6324675619602202, - "objective/scores": 0.35, - "policy/approxkl_avg": 15.127391815185547, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.505859375, - "step": 13, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.017153739929199 - }, - { - "episode": 240, - "epoch": 0.001437969586943236, - "loss/policy_avg": 0.1106414794921875, - "lr": 9.991053169734152e-06, - "objective/entropy": 129.54013061523438, - "objective/kl": 12.085613250732422, - "objective/non_score_reward": -0.6042807102203369, - "objective/rlhf_reward": -0.6837895224491755, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 178.22561645507812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5390625, - "step": 14, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999481201171875 - }, - { - "episode": 256, - "epoch": 0.0015338342260727853, - "loss/policy_avg": 0.01672934927046299, - "lr": 9.990414110429449e-06, - "objective/entropy": 177.98126220703125, - "objective/kl": 7.125063896179199, - "objective/non_score_reward": -0.3562532067298889, - "objective/rlhf_reward": -0.025012841820716947, - "objective/scores": 0.35, - "policy/approxkl_avg": 91.47238159179688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.716796875, - "step": 15, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000582218170166 - }, - { - "episode": 272, - "epoch": 0.0016296988652023342, - "loss/policy_avg": 0.14258402585983276, - "lr": 9.989775051124744e-06, - "objective/entropy": 197.2217559814453, - "objective/kl": 12.70147705078125, - "objective/non_score_reward": -0.6350738406181335, - "objective/rlhf_reward": -1.1616931343949852, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 84.26277160644531, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.630859375, - "step": 16, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9964921474456787 - }, - { - "episode": 288, - "epoch": 0.0017255635043318834, - "loss/policy_avg": -0.0007228106260299683, - "lr": 9.989135991820041e-06, - "objective/entropy": -9.756143569946289, - "objective/kl": 7.940765380859375, - "objective/non_score_reward": -0.3970382809638977, - "objective/rlhf_reward": -0.07238138595455501, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 42.61369323730469, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.64453125, - "step": 17, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0011234283447266 - }, - { - "episode": 304, - "epoch": 0.0018214281434614326, - "loss/policy_avg": 0.13892704248428345, - "lr": 9.988496932515338e-06, - "objective/entropy": 14.549068450927734, - "objective/kl": 9.783748626708984, - "objective/non_score_reward": -0.48918741941452026, - "objective/rlhf_reward": -0.5781475538886606, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 73.81009674072266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.607421875, - "step": 18, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998319149017334 - }, - { - "episode": 320, - "epoch": 0.0019172927825909815, - "loss/policy_avg": 0.12347989529371262, - "lr": 9.987857873210635e-06, - "objective/entropy": 197.0328369140625, - "objective/kl": 9.07555103302002, - "objective/non_score_reward": -0.453777551651001, - "objective/rlhf_reward": -0.15325071436225013, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 74.28388214111328, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5625, - "step": 19, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.001952648162842 - }, - { - "episode": 336, - "epoch": 0.0020131574217205307, - "loss/policy_avg": 0.06666804850101471, - "lr": 9.987218813905932e-06, - "objective/entropy": 180.56707763671875, - "objective/kl": 10.346174240112305, - "objective/non_score_reward": -0.5173087120056152, - "objective/rlhf_reward": -0.6454025848704257, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 88.01742553710938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.595703125, - "step": 20, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9958171844482422 - }, - { - "episode": 352, - "epoch": 0.0021090220608500796, - "loss/policy_avg": 0.12632718682289124, - "lr": 9.986579754601228e-06, - "objective/entropy": 165.49900817871094, - "objective/kl": 10.707776069641113, - "objective/non_score_reward": -0.5353888273239136, - "objective/rlhf_reward": -0.7629530663169442, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 118.42108917236328, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.78125, - "step": 21, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9964122772216797 - }, - { - "episode": 368, - "epoch": 0.0022048866999796286, - "loss/policy_avg": 0.012576747685670853, - "lr": 9.985940695296524e-06, - "objective/entropy": -133.83059692382812, - "objective/kl": 6.06254768371582, - "objective/non_score_reward": -0.3031274080276489, - "objective/rlhf_reward": 0.21132251183215, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 4.497255325317383, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.552734375, - "step": 22, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0017709732055664 - }, - { - "episode": 384, - "epoch": 0.002300751339109178, - "loss/policy_avg": 0.21566970646381378, - "lr": 9.98530163599182e-06, - "objective/entropy": 80.05180358886719, - "objective/kl": 18.019107818603516, - "objective/non_score_reward": -0.9009554386138916, - "objective/rlhf_reward": -2.1799896850186267, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 244.3957061767578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.72265625, - "step": 23, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975435733795166 - }, - { - "episode": 400, - "epoch": 0.002396615978238727, - "loss/policy_avg": 0.21825431287288666, - "lr": 9.984662576687117e-06, - "objective/entropy": 22.858154296875, - "objective/kl": 7.889187812805176, - "objective/non_score_reward": -0.39445942640304565, - "objective/rlhf_reward": 0.5448686011871957, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 45.33286666870117, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.54296875, - "step": 24, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9998900890350342 - }, - { - "episode": 416, - "epoch": 0.002492480617368276, - "loss/policy_avg": 0.2645857036113739, - "lr": 9.984023517382414e-06, - "objective/entropy": 37.619895935058594, - "objective/kl": 11.23090934753418, - "objective/non_score_reward": -0.5615454316139221, - "objective/rlhf_reward": 0.15381827354431143, - "objective/scores": 0.6, - "policy/approxkl_avg": 88.95787811279297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.552734375, - "step": 25, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.996498703956604 - }, - { - "episode": 432, - "epoch": 0.002588345256497825, - "loss/policy_avg": 0.04753335565328598, - "lr": 9.983384458077711e-06, - "objective/entropy": 156.34921264648438, - "objective/kl": 7.371222496032715, - "objective/non_score_reward": -0.36856111884117126, - "objective/rlhf_reward": -0.14873159292332616, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 35.437461853027344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 26, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9979305267333984 - }, - { - "episode": 448, - "epoch": 0.0026842098956273742, - "loss/policy_avg": -0.010932949371635914, - "lr": 9.982745398773006e-06, - "objective/entropy": 16.393407821655273, - "objective/kl": 16.967132568359375, - "objective/non_score_reward": -0.8483567237854004, - "objective/rlhf_reward": -2.051791122465759, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 207.71142578125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.564453125, - "step": 27, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9935011863708496 - }, - { - "episode": 464, - "epoch": 0.002780074534756923, - "loss/policy_avg": 0.23893436789512634, - "lr": 9.982106339468303e-06, - "objective/entropy": 170.59136962890625, - "objective/kl": 15.129783630371094, - "objective/non_score_reward": -0.7564891576766968, - "objective/rlhf_reward": -1.469697265830591, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 135.97763061523438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.72265625, - "step": 28, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9975056648254395 - }, - { - "episode": 480, - "epoch": 0.002875939173886472, - "loss/policy_avg": 0.03272615000605583, - "lr": 9.9814672801636e-06, - "objective/entropy": 6.700323104858398, - "objective/kl": 10.701581954956055, - "objective/non_score_reward": -0.5350791215896606, - "objective/rlhf_reward": -0.6897181971982564, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 63.513145446777344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.60546875, - "step": 29, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998331069946289 - }, - { - "episode": 496, - "epoch": 0.0029718038130160216, - "loss/policy_avg": 0.07188314199447632, - "lr": 9.980828220858897e-06, - "objective/entropy": -47.331199645996094, - "objective/kl": 12.874979019165039, - "objective/non_score_reward": -0.6437489986419678, - "objective/rlhf_reward": -1.1963937664903224, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 77.876220703125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5390625, - "step": 30, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9967491626739502 - }, - { - "episode": 512, - "epoch": 0.0030676684521455705, - "loss/policy_avg": 0.04047826677560806, - "lr": 9.980189161554194e-06, - "objective/entropy": 282.3853759765625, - "objective/kl": 9.654375076293945, - "objective/non_score_reward": -0.4827187657356262, - "objective/rlhf_reward": -0.5716251668676566, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 64.11791229248047, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.89453125, - "step": 31, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9997191429138184 - }, - { - "episode": 528, - "epoch": 0.0031635330912751195, - "loss/policy_avg": 0.07097287476062775, - "lr": 9.97955010224949e-06, - "objective/entropy": 116.042236328125, - "objective/kl": 14.595599174499512, - "objective/non_score_reward": -0.7297799587249756, - "objective/rlhf_reward": -0.7964137478926516, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 272.6925048828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3857421875, - "step": 32, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0027289390563965 - }, - { - "episode": 544, - "epoch": 0.0032593977304046684, - "loss/policy_avg": 0.5246497392654419, - "lr": 9.978911042944786e-06, - "objective/entropy": 8.318304061889648, - "objective/kl": 16.622827529907227, - "objective/non_score_reward": -0.831141471862793, - "objective/rlhf_reward": -1.9990529752074906, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 159.0550079345703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.703125, - "step": 33, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9971305131912231 - }, - { - "episode": 560, - "epoch": 0.003355262369534218, - "loss/policy_avg": 0.20073390007019043, - "lr": 9.978271983640083e-06, - "objective/entropy": 92.97464752197266, - "objective/kl": 10.66767692565918, - "objective/non_score_reward": -0.5333837866783142, - "objective/rlhf_reward": 2.2664648383855823, - "objective/scores": 1.1, - "policy/approxkl_avg": 89.14144134521484, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.544921875, - "step": 34, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000143051147461 - }, - { - "episode": 576, - "epoch": 0.0034511270086637668, - "loss/policy_avg": 0.04765152558684349, - "lr": 9.977632924335378e-06, - "objective/entropy": 149.43089294433594, - "objective/kl": 16.67333221435547, - "objective/non_score_reward": -0.8336665630340576, - "objective/rlhf_reward": -0.9346663713455201, - "objective/scores": 0.6, - "policy/approxkl_avg": 189.3590850830078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4765625, - "step": 35, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9986698627471924 - }, - { - "episode": 592, - "epoch": 0.0035469916477933157, - "loss/policy_avg": 0.40008074045181274, - "lr": 9.976993865030675e-06, - "objective/entropy": 157.10501098632812, - "objective/kl": 13.927867889404297, - "objective/non_score_reward": -0.6963933706283569, - "objective/rlhf_reward": -1.406971328941685, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 121.78231811523438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.70703125, - "step": 36, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9974275827407837 - }, - { - "episode": 608, - "epoch": 0.003642856286922865, - "loss/policy_avg": 0.08663024008274078, - "lr": 9.976354805725972e-06, - "objective/entropy": 47.76446533203125, - "objective/kl": 13.560833930969238, - "objective/non_score_reward": -0.6780416965484619, - "objective/rlhf_reward": -0.5894605539002753, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 43.71810531616211, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5078125, - "step": 37, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991490840911865 - }, - { - "episode": 624, - "epoch": 0.003738720926052414, - "loss/policy_avg": 0.08268876373767853, - "lr": 9.975715746421269e-06, - "objective/entropy": 192.41729736328125, - "objective/kl": 6.687016010284424, - "objective/non_score_reward": -0.3343508243560791, - "objective/rlhf_reward": 0.021846643354015427, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 67.82701873779297, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619140625, - "step": 38, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999939203262329 - }, - { - "episode": 640, - "epoch": 0.003834585565181963, - "loss/policy_avg": 0.05995899811387062, - "lr": 9.975076687116566e-06, - "objective/entropy": -98.350341796875, - "objective/kl": 9.015666961669922, - "objective/non_score_reward": -0.450783371925354, - "objective/rlhf_reward": 0.14427768908268623, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 51.733055114746094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5859375, - "step": 39, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9974713325500488 - }, - { - "episode": 656, - "epoch": 0.003930450204311512, - "loss/policy_avg": 0.18854951858520508, - "lr": 9.97443762781186e-06, - "objective/entropy": 141.67947387695312, - "objective/kl": 10.309185028076172, - "objective/non_score_reward": -0.5154592990875244, - "objective/rlhf_reward": -0.6618371069431306, - "objective/scores": 0.35, - "policy/approxkl_avg": 71.02857208251953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.744140625, - "step": 40, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993364810943604 - }, - { - "episode": 672, - "epoch": 0.004026314843441061, - "loss/policy_avg": 0.05062849074602127, - "lr": 9.973798568507158e-06, - "objective/entropy": -38.6858024597168, - "objective/kl": 9.445882797241211, - "objective/non_score_reward": -0.4722941517829895, - "objective/rlhf_reward": -1.8891766667366028, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.4856438636779785, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62890625, - "step": 41, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9984209537506104 - }, - { - "episode": 688, - "epoch": 0.00412217948257061, - "loss/policy_avg": 0.09501229226589203, - "lr": 9.973159509202454e-06, - "objective/entropy": 17.35771942138672, - "objective/kl": 10.873266220092773, - "objective/non_score_reward": -0.5436632633209229, - "objective/rlhf_reward": -0.44131985406080876, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 98.38662719726562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6953125, - "step": 42, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995697736740112 - }, - { - "episode": 704, - "epoch": 0.004218044121700159, - "loss/policy_avg": 0.32498252391815186, - "lr": 9.972520449897751e-06, - "objective/entropy": 174.98866271972656, - "objective/kl": 11.279447555541992, - "objective/non_score_reward": -0.5639723539352417, - "objective/rlhf_reward": -0.7749369321421384, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 62.73210144042969, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.552734375, - "step": 43, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0007286071777344 - }, - { - "episode": 720, - "epoch": 0.004313908760829708, - "loss/policy_avg": 0.3995896577835083, - "lr": 9.971881390593048e-06, - "objective/entropy": 36.609832763671875, - "objective/kl": 19.769756317138672, - "objective/non_score_reward": -0.9884878993034363, - "objective/rlhf_reward": -2.1291227295723667, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 164.33892822265625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.708984375, - "step": 44, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9945415258407593 - }, - { - "episode": 736, - "epoch": 0.004409773399959257, - "loss/policy_avg": 0.17710548639297485, - "lr": 9.971242331288345e-06, - "objective/entropy": 93.23808288574219, - "objective/kl": 16.88797378540039, - "objective/non_score_reward": -0.8443987965583801, - "objective/rlhf_reward": -1.7157356492882831, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 54.64923858642578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.779296875, - "step": 45, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981857538223267 - }, - { - "episode": 752, - "epoch": 0.004505638039088807, - "loss/policy_avg": 0.32767364382743835, - "lr": 9.97060327198364e-06, - "objective/entropy": 202.11843872070312, - "objective/kl": 14.050471305847168, - "objective/non_score_reward": -0.7025235295295715, - "objective/rlhf_reward": -1.484581295281572, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 76.14016723632812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 46, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9997007846832275 - }, - { - "episode": 768, - "epoch": 0.004601502678218356, - "loss/policy_avg": 0.08174459636211395, - "lr": 9.969964212678937e-06, - "objective/entropy": 54.37752151489258, - "objective/kl": 15.1139497756958, - "objective/non_score_reward": -0.75569748878479, - "objective/rlhf_reward": -1.6635400888666343, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 83.4612045288086, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4296875, - "step": 47, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9972434043884277 - }, - { - "episode": 784, - "epoch": 0.004697367317347905, - "loss/policy_avg": 0.03365965187549591, - "lr": 9.969325153374234e-06, - "objective/entropy": 85.39935302734375, - "objective/kl": 13.452342987060547, - "objective/non_score_reward": -0.6726170778274536, - "objective/rlhf_reward": -0.74305723138326, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 61.629390716552734, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.572265625, - "step": 48, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998470544815063 - }, - { - "episode": 800, - "epoch": 0.004793231956477454, - "loss/policy_avg": 0.009335246868431568, - "lr": 9.968686094069531e-06, - "objective/entropy": 288.22564697265625, - "objective/kl": 19.127742767333984, - "objective/non_score_reward": -0.9563871026039124, - "objective/rlhf_reward": -0.9018295153391089, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 176.43731689453125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.892578125, - "step": 49, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9936624765396118 - }, - { - "episode": 816, - "epoch": 0.004889096595607003, - "loss/policy_avg": 0.13336139917373657, - "lr": 9.968047034764828e-06, - "objective/entropy": -38.686851501464844, - "objective/kl": 18.06523895263672, - "objective/non_score_reward": -0.9032620191574097, - "objective/rlhf_reward": -2.1320952503041024, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 179.73486328125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65625, - "step": 50, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.996016263961792 - }, - { - "episode": 832, - "epoch": 0.004984961234736552, - "loss/policy_avg": 0.09758515655994415, - "lr": 9.967407975460123e-06, - "objective/entropy": -32.55284881591797, - "objective/kl": 10.72513198852539, - "objective/non_score_reward": -0.5362565517425537, - "objective/rlhf_reward": -0.721194286544887, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 44.48727798461914, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.736328125, - "step": 51, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9976041316986084 - }, - { - "episode": 848, - "epoch": 0.005080825873866101, - "loss/policy_avg": 0.5202991366386414, - "lr": 9.96676891615542e-06, - "objective/entropy": 45.2802734375, - "objective/kl": 16.129152297973633, - "objective/non_score_reward": -0.8064576387405396, - "objective/rlhf_reward": -1.2784193260239918, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 124.33740234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.623046875, - "step": 52, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978928565979004 - }, - { - "episode": 864, - "epoch": 0.00517669051299565, - "loss/policy_avg": 0.28677505254745483, - "lr": 9.966129856850717e-06, - "objective/entropy": -76.81179809570312, - "objective/kl": 15.223251342773438, - "objective/non_score_reward": -0.761162519454956, - "objective/rlhf_reward": -1.5288782207094989, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 69.77767944335938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7890625, - "step": 53, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9999229907989502 - }, - { - "episode": 880, - "epoch": 0.0052725551521251995, - "loss/policy_avg": 0.20859162509441376, - "lr": 9.965490797546014e-06, - "objective/entropy": -21.344478607177734, - "objective/kl": 10.70494556427002, - "objective/non_score_reward": -0.535247266292572, - "objective/rlhf_reward": -0.7623869264997064, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 98.75808715820312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.56640625, - "step": 54, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9975996017456055 - }, - { - "episode": 896, - "epoch": 0.0053684197912547485, - "loss/policy_avg": 1.2579694986343384, - "lr": 9.96485173824131e-06, - "objective/entropy": 164.7299346923828, - "objective/kl": 18.096805572509766, - "objective/non_score_reward": -0.9048402309417725, - "objective/rlhf_reward": -2.0152409709134873, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 95.78445434570312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.736328125, - "step": 55, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9966285228729248 - }, - { - "episode": 912, - "epoch": 0.0054642844303842975, - "loss/policy_avg": 0.3564913868904114, - "lr": 9.964212678936606e-06, - "objective/entropy": 85.46858215332031, - "objective/kl": 17.930484771728516, - "objective/non_score_reward": -0.89652419090271, - "objective/rlhf_reward": -1.4633905313172677, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 79.41477966308594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4091796875, - "step": 56, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984946250915527 - }, - { - "episode": 928, - "epoch": 0.005560149069513846, - "loss/policy_avg": 0.03960660099983215, - "lr": 9.963573619631903e-06, - "objective/entropy": 205.954833984375, - "objective/kl": 17.15917205810547, - "objective/non_score_reward": -0.8579585552215576, - "objective/rlhf_reward": -1.3091281972089148, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 23.591196060180664, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.791015625, - "step": 57, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997645378112793 - }, - { - "episode": 944, - "epoch": 0.005656013708643395, - "loss/policy_avg": -0.00983378104865551, - "lr": 9.9629345603272e-06, - "objective/entropy": -1.1022186279296875, - "objective/kl": 16.26142692565918, - "objective/non_score_reward": -0.8130713105201721, - "objective/rlhf_reward": 1.1477148175239567, - "objective/scores": 1.1, - "policy/approxkl_avg": 81.65092468261719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.552734375, - "step": 58, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.99957275390625 - }, - { - "episode": 960, - "epoch": 0.005751878347772944, - "loss/policy_avg": 0.32060182094573975, - "lr": 9.962295501022495e-06, - "objective/entropy": 48.09014892578125, - "objective/kl": 7.438636302947998, - "objective/non_score_reward": -0.3719318211078644, - "objective/rlhf_reward": 0.6349789739391469, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 7.77626895904541, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.822265625, - "step": 59, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.008031129837036 - }, - { - "episode": 976, - "epoch": 0.005847742986902493, - "loss/policy_avg": 0.2516993582248688, - "lr": 9.961656441717792e-06, - "objective/entropy": -46.64883804321289, - "objective/kl": 19.601835250854492, - "objective/non_score_reward": -0.9800918102264404, - "objective/rlhf_reward": -2.594854134946985, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 181.5974578857422, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.609375, - "step": 60, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988071918487549 - }, - { - "episode": 992, - "epoch": 0.005943607626032043, - "loss/policy_avg": 0.1109720841050148, - "lr": 9.961017382413088e-06, - "objective/entropy": 97.6422348022461, - "objective/kl": 13.844486236572266, - "objective/non_score_reward": -0.692224383354187, - "objective/rlhf_reward": -1.2126380791335847, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 96.34603118896484, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.583984375, - "step": 61, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9974918365478516 - }, - { - "episode": 1008, - "epoch": 0.006039472265161592, - "loss/policy_avg": -0.05115126073360443, - "lr": 9.960378323108385e-06, - "objective/entropy": 34.42061996459961, - "objective/kl": 14.079090118408203, - "objective/non_score_reward": -0.7039545774459839, - "objective/rlhf_reward": -1.4565682944997977, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 49.87873840332031, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.677734375, - "step": 62, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982357025146484 - }, - { - "episode": 1024, - "epoch": 0.006135336904291141, - "loss/policy_avg": 0.22280101478099823, - "lr": 9.959739263803682e-06, - "objective/entropy": -24.89067840576172, - "objective/kl": 19.501176834106445, - "objective/non_score_reward": -0.9750589728355408, - "objective/rlhf_reward": -2.4496376319841, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 243.47512817382812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.888671875, - "step": 63, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999916911125183 - }, - { - "episode": 1040, - "epoch": 0.00623120154342069, - "loss/policy_avg": 0.36840492486953735, - "lr": 9.959100204498979e-06, - "objective/entropy": 134.6929931640625, - "objective/kl": 22.332670211791992, - "objective/non_score_reward": -1.1166335344314575, - "objective/rlhf_reward": -2.641705389293741, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 136.65045166015625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 64, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9981276988983154 - }, - { - "episode": 1056, - "epoch": 0.006327066182550239, - "loss/policy_avg": 0.09098342061042786, - "lr": 9.958461145194274e-06, - "objective/entropy": -26.864063262939453, - "objective/kl": 13.052759170532227, - "objective/non_score_reward": -0.6526379585266113, - "objective/rlhf_reward": -0.7857228770580997, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 62.885929107666016, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.603515625, - "step": 65, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997183084487915 - }, - { - "episode": 1072, - "epoch": 0.006422930821679788, - "loss/policy_avg": 0.27086368203163147, - "lr": 9.957822085889571e-06, - "objective/entropy": -58.01667404174805, - "objective/kl": 16.48623275756836, - "objective/non_score_reward": -0.8243115544319153, - "objective/rlhf_reward": -1.635386770189391, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 153.92050170898438, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.583984375, - "step": 66, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0005505084991455 - }, - { - "episode": 1088, - "epoch": 0.006518795460809337, - "loss/policy_avg": 1.2388324737548828, - "lr": 9.957183026584868e-06, - "objective/entropy": 99.91399383544922, - "objective/kl": 21.524110794067383, - "objective/non_score_reward": -1.0762056112289429, - "objective/rlhf_reward": -2.6429626993542774, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 170.69760131835938, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.64453125, - "step": 67, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9946773052215576 - }, - { - "episode": 1104, - "epoch": 0.006614660099938887, - "loss/policy_avg": 0.330521821975708, - "lr": 9.956543967280165e-06, - "objective/entropy": -76.99481201171875, - "objective/kl": 15.58948802947998, - "objective/non_score_reward": -0.7794743776321411, - "objective/rlhf_reward": -1.7178976856172086, - "objective/scores": 0.35, - "policy/approxkl_avg": 218.45574951171875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.64453125, - "step": 68, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9997715950012207 - }, - { - "episode": 1120, - "epoch": 0.006710524739068436, - "loss/policy_avg": 0.11920663714408875, - "lr": 9.955904907975462e-06, - "objective/entropy": 70.55160522460938, - "objective/kl": 20.134777069091797, - "objective/non_score_reward": -1.0067389011383057, - "objective/rlhf_reward": -2.6853197722727353, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 62.195674896240234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.34765625, - "step": 69, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001072406768799 - }, - { - "episode": 1136, - "epoch": 0.006806389378197985, - "loss/policy_avg": -0.17695794999599457, - "lr": 9.955265848670757e-06, - "objective/entropy": 101.99272918701172, - "objective/kl": 12.69788932800293, - "objective/non_score_reward": -0.6348943710327148, - "objective/rlhf_reward": -2.539577692747116, - "objective/scores": 0.0, - "policy/approxkl_avg": 64.835693359375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.44140625, - "step": 70, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0115315914154053 - }, - { - "episode": 1152, - "epoch": 0.0069022540173275335, - "loss/policy_avg": 0.35137245059013367, - "lr": 9.954626789366054e-06, - "objective/entropy": 79.80499267578125, - "objective/kl": 21.120101928710938, - "objective/non_score_reward": -1.0560050010681152, - "objective/rlhf_reward": -2.1013141296067577, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 124.16864776611328, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.68359375, - "step": 71, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998917818069458 - }, - { - "episode": 1168, - "epoch": 0.0069981186564570825, - "loss/policy_avg": 0.07422849535942078, - "lr": 9.95398773006135e-06, - "objective/entropy": 9.376724243164062, - "objective/kl": 15.093628883361816, - "objective/non_score_reward": -0.7546814680099487, - "objective/rlhf_reward": -1.6594760653719138, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 47.567962646484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65234375, - "step": 72, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9958832263946533 - }, - { - "episode": 1184, - "epoch": 0.0070939832955866314, - "loss/policy_avg": 0.11969298124313354, - "lr": 9.953348670756648e-06, - "objective/entropy": 133.57423400878906, - "objective/kl": 20.2343807220459, - "objective/non_score_reward": -1.0117191076278687, - "objective/rlhf_reward": -1.1231571778070655, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 93.79672241210938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.423828125, - "step": 73, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0005669593811035 - }, - { - "episode": 1200, - "epoch": 0.00718984793471618, - "loss/policy_avg": 0.2395152747631073, - "lr": 9.952709611451944e-06, - "objective/entropy": 31.68697166442871, - "objective/kl": 20.96116828918457, - "objective/non_score_reward": -1.0480585098266602, - "objective/rlhf_reward": -2.711281481202006, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 194.83474731445312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.669921875, - "step": 74, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9953577518463135 - }, - { - "episode": 1216, - "epoch": 0.00728571257384573, - "loss/policy_avg": 0.27856501936912537, - "lr": 9.952070552147241e-06, - "objective/entropy": 119.42091369628906, - "objective/kl": 11.30095100402832, - "objective/non_score_reward": -0.5650476217269897, - "objective/rlhf_reward": -0.9185547738367612, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 59.14590835571289, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.75, - "step": 75, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9953703880310059 - }, - { - "episode": 1232, - "epoch": 0.007381577212975279, - "loss/policy_avg": 0.21030786633491516, - "lr": 9.951431492842536e-06, - "objective/entropy": 7.310768127441406, - "objective/kl": 6.645857810974121, - "objective/non_score_reward": -0.3322928845882416, - "objective/rlhf_reward": 0.04943063011993787, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.611559867858887, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.591796875, - "step": 76, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9996391534805298 - }, - { - "episode": 1248, - "epoch": 0.007477441852104828, - "loss/policy_avg": 0.4117072820663452, - "lr": 9.950792433537833e-06, - "objective/entropy": -109.53082275390625, - "objective/kl": 11.825650215148926, - "objective/non_score_reward": -0.5912825465202332, - "objective/rlhf_reward": 0.03486987352371207, - "objective/scores": 0.6, - "policy/approxkl_avg": 19.0810604095459, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6171875, - "step": 77, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981213808059692 - }, - { - "episode": 1264, - "epoch": 0.007573306491234377, - "loss/policy_avg": 0.2597622275352478, - "lr": 9.950153374233129e-06, - "objective/entropy": -29.7529296875, - "objective/kl": 18.43012809753418, - "objective/non_score_reward": -0.9215063452720642, - "objective/rlhf_reward": -2.2860254704952236, - "objective/scores": 0.35, - "policy/approxkl_avg": 267.2847900390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.70703125, - "step": 78, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997544765472412 - }, - { - "episode": 1280, - "epoch": 0.007669171130363926, - "loss/policy_avg": 0.2407466471195221, - "lr": 9.949514314928425e-06, - "objective/entropy": 14.07373046875, - "objective/kl": 20.781753540039062, - "objective/non_score_reward": -1.0390876531600952, - "objective/rlhf_reward": -1.2326316579591956, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 147.4822235107422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.724609375, - "step": 79, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9987099170684814 - }, - { - "episode": 1296, - "epoch": 0.007765035769493475, - "loss/policy_avg": 0.17344285547733307, - "lr": 9.948875255623722e-06, - "objective/entropy": 112.44259643554688, - "objective/kl": 10.0985746383667, - "objective/non_score_reward": -0.504928708076477, - "objective/rlhf_reward": 0.38028510808944693, - "objective/scores": 0.6, - "policy/approxkl_avg": 4.8866167068481445, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.443359375, - "step": 80, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0143842697143555 - }, - { - "episode": 1312, - "epoch": 0.007860900408623025, - "loss/policy_avg": 0.14816004037857056, - "lr": 9.94823619631902e-06, - "objective/entropy": 67.11033630371094, - "objective/kl": 17.487518310546875, - "objective/non_score_reward": -0.8743758797645569, - "objective/rlhf_reward": -2.1558679251963193, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 18.69343376159668, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4619140625, - "step": 81, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998113751411438 - }, - { - "episode": 1328, - "epoch": 0.007956765047752574, - "loss/policy_avg": 0.2536642849445343, - "lr": 9.947597137014316e-06, - "objective/entropy": -71.85224914550781, - "objective/kl": 11.223343849182129, - "objective/non_score_reward": -0.5611672401428223, - "objective/rlhf_reward": -0.7637163875654935, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 37.78028869628906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.48828125, - "step": 82, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0003702640533447 - }, - { - "episode": 1344, - "epoch": 0.008052629686882123, - "loss/policy_avg": 0.3479039669036865, - "lr": 9.946958077709611e-06, - "objective/entropy": 146.41241455078125, - "objective/kl": 20.458145141601562, - "objective/non_score_reward": -1.0229072570800781, - "objective/rlhf_reward": -2.732379042838497, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 64.28889465332031, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.705078125, - "step": 83, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9976742267608643 - }, - { - "episode": 1360, - "epoch": 0.008148494326011672, - "loss/policy_avg": 0.10525624454021454, - "lr": 9.946319018404908e-06, - "objective/entropy": -43.42662048339844, - "objective/kl": 13.858359336853027, - "objective/non_score_reward": -0.6929180026054382, - "objective/rlhf_reward": -0.6489658228316642, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 61.37925720214844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.48828125, - "step": 84, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0012595653533936 - }, - { - "episode": 1376, - "epoch": 0.00824435896514122, - "loss/policy_avg": 0.3409525156021118, - "lr": 9.945679959100205e-06, - "objective/entropy": 1.5508041381835938, - "objective/kl": 19.05010223388672, - "objective/non_score_reward": -0.9525051116943359, - "objective/rlhf_reward": -2.205900583330708, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 97.6533203125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.63671875, - "step": 85, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000422477722168 - }, - { - "episode": 1392, - "epoch": 0.00834022360427077, - "loss/policy_avg": 0.3110717535018921, - "lr": 9.945040899795502e-06, - "objective/entropy": 215.75965881347656, - "objective/kl": 18.800819396972656, - "objective/non_score_reward": -0.9400409460067749, - "objective/rlhf_reward": -2.156043860975819, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 84.93620300292969, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.669921875, - "step": 86, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9962902069091797 - }, - { - "episode": 1408, - "epoch": 0.008436088243400319, - "loss/policy_avg": 0.02868543565273285, - "lr": 9.944401840490799e-06, - "objective/entropy": 154.10025024414062, - "objective/kl": 13.492873191833496, - "objective/non_score_reward": -0.6746436357498169, - "objective/rlhf_reward": -0.9652413214246431, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 42.483882904052734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.44921875, - "step": 87, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9983662366867065 - }, - { - "episode": 1424, - "epoch": 0.008531952882529868, - "loss/policy_avg": 0.07607420533895493, - "lr": 9.943762781186096e-06, - "objective/entropy": 202.40365600585938, - "objective/kl": 13.719297409057617, - "objective/non_score_reward": -0.685964822769165, - "objective/rlhf_reward": 1.6561407089233402, - "objective/scores": 1.1, - "policy/approxkl_avg": 20.57819175720215, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.728515625, - "step": 88, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999366283416748 - }, - { - "episode": 1440, - "epoch": 0.008627817521659416, - "loss/policy_avg": 0.16665664315223694, - "lr": 9.94312372188139e-06, - "objective/entropy": -100.20193481445312, - "objective/kl": 15.216776847839355, - "objective/non_score_reward": -0.7608388662338257, - "objective/rlhf_reward": -1.4392355120817002, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 85.36731719970703, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.626953125, - "step": 89, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990196228027344 - }, - { - "episode": 1456, - "epoch": 0.008723682160788965, - "loss/policy_avg": 0.19817781448364258, - "lr": 9.942484662576688e-06, - "objective/entropy": -0.7409725189208984, - "objective/kl": 10.389724731445312, - "objective/non_score_reward": -0.5194862484931946, - "objective/rlhf_reward": 2.3220549762248996, - "objective/scores": 1.1, - "policy/approxkl_avg": 12.642692565917969, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.52734375, - "step": 90, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989277124404907 - }, - { - "episode": 1472, - "epoch": 0.008819546799918514, - "loss/policy_avg": 0.2365586757659912, - "lr": 9.941845603271985e-06, - "objective/entropy": 152.64306640625, - "objective/kl": 21.58309555053711, - "objective/non_score_reward": -1.0791547298431396, - "objective/rlhf_reward": -2.9573691723093223, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 87.72661590576172, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.771484375, - "step": 91, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999784231185913 - }, - { - "episode": 1488, - "epoch": 0.008915411439048063, - "loss/policy_avg": 0.059907689690589905, - "lr": 9.941206543967281e-06, - "objective/entropy": 89.6580810546875, - "objective/kl": 16.996726989746094, - "objective/non_score_reward": -0.8498364686965942, - "objective/rlhf_reward": -1.9755135669308581, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 72.40145874023438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.861328125, - "step": 92, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003039836883545 - }, - { - "episode": 1504, - "epoch": 0.009011276078177614, - "loss/policy_avg": 0.14265713095664978, - "lr": 9.940567484662578e-06, - "objective/entropy": -33.708492279052734, - "objective/kl": 15.94516372680664, - "objective/non_score_reward": -0.797258198261261, - "objective/rlhf_reward": -0.2653137638580527, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 78.95989990234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53125, - "step": 93, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997810959815979 - }, - { - "episode": 1520, - "epoch": 0.009107140717307163, - "loss/policy_avg": -0.018713245168328285, - "lr": 9.939928425357874e-06, - "objective/entropy": -3.091245651245117, - "objective/kl": 14.482427597045898, - "objective/non_score_reward": -0.7241213917732239, - "objective/rlhf_reward": -1.2346261046534641, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 56.76847839355469, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.501953125, - "step": 94, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993091821670532 - }, - { - "episode": 1536, - "epoch": 0.009203005356436712, - "loss/policy_avg": -0.0069353943690657616, - "lr": 9.93928936605317e-06, - "objective/entropy": 95.46006774902344, - "objective/kl": 20.928672790527344, - "objective/non_score_reward": -1.046433687210083, - "objective/rlhf_reward": -2.360906060012888, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 103.58160400390625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 95, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974098205566406 - }, - { - "episode": 1552, - "epoch": 0.009298869995566261, - "loss/policy_avg": 0.0523187518119812, - "lr": 9.938650306748467e-06, - "objective/entropy": 16.342994689941406, - "objective/kl": 20.205509185791016, - "objective/non_score_reward": -1.0102753639221191, - "objective/rlhf_reward": 0.35889836549758947, - "objective/scores": 1.1, - "policy/approxkl_avg": 84.55277252197266, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4697265625, - "step": 96, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000453233718872 - }, - { - "episode": 1568, - "epoch": 0.00939473463469581, - "loss/policy_avg": 0.18428044021129608, - "lr": 9.938011247443764e-06, - "objective/entropy": -31.386062622070312, - "objective/kl": 19.641075134277344, - "objective/non_score_reward": -0.9820537567138672, - "objective/rlhf_reward": -1.8055088541665412, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 92.56884002685547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59765625, - "step": 97, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001230239868164 - }, - { - "episode": 1584, - "epoch": 0.009490599273825359, - "loss/policy_avg": -0.11768925935029984, - "lr": 9.937372188139061e-06, - "objective/entropy": -29.0854434967041, - "objective/kl": 16.647226333618164, - "objective/non_score_reward": -0.8323614001274109, - "objective/rlhf_reward": -1.9701957342371177, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.0866272449493408, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.541015625, - "step": 98, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0018460750579834 - }, - { - "episode": 1600, - "epoch": 0.009586463912954908, - "loss/policy_avg": 0.06727765500545502, - "lr": 9.936733128834358e-06, - "objective/entropy": 96.53413391113281, - "objective/kl": 21.015684127807617, - "objective/non_score_reward": -1.0507843494415283, - "objective/rlhf_reward": -2.8031371593475343, - "objective/scores": 0.35, - "policy/approxkl_avg": 36.56340026855469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 99, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9949061870574951 - }, - { - "episode": 1616, - "epoch": 0.009682328552084457, - "loss/policy_avg": 0.28386813402175903, - "lr": 9.936094069529653e-06, - "objective/entropy": 33.901954650878906, - "objective/kl": 19.533782958984375, - "objective/non_score_reward": -0.9766892194747925, - "objective/rlhf_reward": -2.425804230387568, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 162.0339813232422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5859375, - "step": 100, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9985501766204834 - }, - { - "episode": 1632, - "epoch": 0.009778193191214006, - "loss/policy_avg": 0.11220409721136093, - "lr": 9.93545501022495e-06, - "objective/entropy": -3.93096923828125, - "objective/kl": 22.981700897216797, - "objective/non_score_reward": -1.1490850448608398, - "objective/rlhf_reward": -3.1725080504017744, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 46.0514030456543, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6328125, - "step": 101, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0070252418518066 - }, - { - "episode": 1648, - "epoch": 0.009874057830343555, - "loss/policy_avg": 0.20420242846012115, - "lr": 9.934815950920245e-06, - "objective/entropy": 198.98751831054688, - "objective/kl": 17.92270278930664, - "objective/non_score_reward": -0.8961352109909058, - "objective/rlhf_reward": -1.759712155136179, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 55.74137878417969, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65234375, - "step": 102, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980387687683105 - }, - { - "episode": 1664, - "epoch": 0.009969922469473104, - "loss/policy_avg": 0.27041423320770264, - "lr": 9.934176891615542e-06, - "objective/entropy": 1.5637626647949219, - "objective/kl": 12.633028030395508, - "objective/non_score_reward": -0.6316514015197754, - "objective/rlhf_reward": -0.7017769768563022, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 13.92137622833252, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4208984375, - "step": 103, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987752437591553 - }, - { - "episode": 1680, - "epoch": 0.010065787108602653, - "loss/policy_avg": 0.318324476480484, - "lr": 9.933537832310839e-06, - "objective/entropy": 218.76858520507812, - "objective/kl": 21.40100860595703, - "objective/non_score_reward": -1.0700504779815674, - "objective/rlhf_reward": -2.9385662584597165, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 90.99249267578125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.677734375, - "step": 104, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.998801827430725 - }, - { - "episode": 1696, - "epoch": 0.010161651747732202, - "loss/policy_avg": 0.3075984716415405, - "lr": 9.932898773006136e-06, - "objective/entropy": -56.81090545654297, - "objective/kl": 10.457717895507812, - "objective/non_score_reward": -0.5228859186172485, - "objective/rlhf_reward": -0.7129414687431871, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 48.63943862915039, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 105, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.995776653289795 - }, - { - "episode": 1712, - "epoch": 0.01025751638686175, - "loss/policy_avg": 0.5551585555076599, - "lr": 9.932259713701433e-06, - "objective/entropy": -48.12900924682617, - "objective/kl": 21.915470123291016, - "objective/non_score_reward": -1.0957735776901245, - "objective/rlhf_reward": -1.459375207067701, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 33.369083404541016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.71484375, - "step": 106, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.995157241821289 - }, - { - "episode": 1728, - "epoch": 0.0103533810259913, - "loss/policy_avg": 0.252463161945343, - "lr": 9.931620654396728e-06, - "objective/entropy": -69.64755249023438, - "objective/kl": 15.248108863830566, - "objective/non_score_reward": -0.7624054551124573, - "objective/rlhf_reward": -1.707986166983276, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 59.05755615234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7578125, - "step": 107, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9963257312774658 - }, - { - "episode": 1744, - "epoch": 0.01044924566512085, - "loss/policy_avg": 0.13919854164123535, - "lr": 9.930981595092025e-06, - "objective/entropy": -133.55258178710938, - "objective/kl": 17.2213134765625, - "objective/non_score_reward": -0.8610656261444092, - "objective/rlhf_reward": -2.0850126979097556, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 32.41887664794922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5234375, - "step": 108, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9992578029632568 - }, - { - "episode": 1760, - "epoch": 0.010545110304250399, - "loss/policy_avg": 0.5300755500793457, - "lr": 9.930342535787322e-06, - "objective/entropy": -9.471179962158203, - "objective/kl": 18.607471466064453, - "objective/non_score_reward": -0.9303736090660095, - "objective/rlhf_reward": -2.3214945554733273, - "objective/scores": 0.35, - "policy/approxkl_avg": 31.75185203552246, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.654296875, - "step": 109, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994070529937744 - }, - { - "episode": 1776, - "epoch": 0.010640974943379948, - "loss/policy_avg": 0.17107412219047546, - "lr": 9.929703476482619e-06, - "objective/entropy": 72.44110107421875, - "objective/kl": 16.862125396728516, - "objective/non_score_reward": -0.8431062698364258, - "objective/rlhf_reward": -3.372425138950348, - "objective/scores": 0.0, - "policy/approxkl_avg": 66.22834777832031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.70703125, - "step": 110, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.995293378829956 - }, - { - "episode": 1792, - "epoch": 0.010736839582509497, - "loss/policy_avg": -0.11443672329187393, - "lr": 9.929064417177915e-06, - "objective/entropy": 80.82670593261719, - "objective/kl": 18.79993438720703, - "objective/non_score_reward": -0.9399967789649963, - "objective/rlhf_reward": -2.336154927213756, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 31.270248413085938, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.5625, - "step": 111, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.007622241973877 - }, - { - "episode": 1808, - "epoch": 0.010832704221639046, - "loss/policy_avg": 0.0878123939037323, - "lr": 9.928425357873212e-06, - "objective/entropy": -118.92440795898438, - "objective/kl": 17.83495330810547, - "objective/non_score_reward": -0.8917477130889893, - "objective/rlhf_reward": -2.2253551392847593, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 20.88257598876953, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 112, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.996976613998413 - }, - { - "episode": 1824, - "epoch": 0.010928568860768595, - "loss/policy_avg": 0.18364591896533966, - "lr": 9.927786298568507e-06, - "objective/entropy": 8.144821166992188, - "objective/kl": 14.821235656738281, - "objective/non_score_reward": -0.741061806678772, - "objective/rlhf_reward": -1.2309138337771097, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 17.778968811035156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 113, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000706911087036 - }, - { - "episode": 1840, - "epoch": 0.011024433499898144, - "loss/policy_avg": 0.06979192793369293, - "lr": 9.927147239263804e-06, - "objective/entropy": -2.9724502563476562, - "objective/kl": 17.076000213623047, - "objective/non_score_reward": -0.8538000583648682, - "objective/rlhf_reward": -1.8994284508549533, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 46.98078918457031, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.798828125, - "step": 114, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999396800994873 - }, - { - "episode": 1856, - "epoch": 0.011120298139027693, - "loss/policy_avg": 0.27465301752090454, - "lr": 9.926508179959101e-06, - "objective/entropy": 40.056610107421875, - "objective/kl": 22.515907287597656, - "objective/non_score_reward": -1.1257953643798828, - "objective/rlhf_reward": -2.8413221291905506, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 81.93817138671875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 115, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0007479190826416 - }, - { - "episode": 1872, - "epoch": 0.011216162778157242, - "loss/policy_avg": 0.3945024013519287, - "lr": 9.925869120654398e-06, - "objective/entropy": 69.15873718261719, - "objective/kl": 21.74050521850586, - "objective/non_score_reward": -1.0870254039764404, - "objective/rlhf_reward": -3.0225888824760148, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 38.46895980834961, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59765625, - "step": 116, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0014419555664062 - }, - { - "episode": 1888, - "epoch": 0.01131202741728679, - "loss/policy_avg": 0.5689772367477417, - "lr": 9.925230061349695e-06, - "objective/entropy": 144.26678466796875, - "objective/kl": 14.530990600585938, - "objective/non_score_reward": -0.726549506187439, - "objective/rlhf_reward": -1.1728648702303568, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.715579628944397, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8203125, - "step": 117, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0058815479278564 - }, - { - "episode": 1904, - "epoch": 0.01140789205641634, - "loss/policy_avg": -0.025625256821513176, - "lr": 9.92459100204499e-06, - "objective/entropy": -91.6683120727539, - "objective/kl": 16.61312484741211, - "objective/non_score_reward": -0.8306561708450317, - "objective/rlhf_reward": -1.944022663918835, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 18.064186096191406, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4990234375, - "step": 118, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999868392944336 - }, - { - "episode": 1920, - "epoch": 0.011503756695545889, - "loss/policy_avg": 0.4135175943374634, - "lr": 9.923951942740287e-06, - "objective/entropy": 145.33905029296875, - "objective/kl": 18.559207916259766, - "objective/non_score_reward": -0.9279603958129883, - "objective/rlhf_reward": -1.5891353509583808, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 19.033662796020508, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 119, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9981472492218018 - }, - { - "episode": 1936, - "epoch": 0.011599621334675438, - "loss/policy_avg": 0.3322446942329407, - "lr": 9.923312883435584e-06, - "objective/entropy": 109.6761474609375, - "objective/kl": 18.231651306152344, - "objective/non_score_reward": -0.9115825891494751, - "objective/rlhf_reward": -1.2463304907083512, - "objective/scores": 0.6, - "policy/approxkl_avg": 108.51126098632812, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.65625, - "step": 120, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.996593952178955 - }, - { - "episode": 1952, - "epoch": 0.011695485973804987, - "loss/policy_avg": 0.22522342205047607, - "lr": 9.92267382413088e-06, - "objective/entropy": 95.46246337890625, - "objective/kl": 16.838998794555664, - "objective/non_score_reward": -0.841949999332428, - "objective/rlhf_reward": -1.8520282743298375, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 14.038084983825684, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8046875, - "step": 121, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997774362564087 - }, - { - "episode": 1968, - "epoch": 0.011791350612934537, - "loss/policy_avg": 0.18379229307174683, - "lr": 9.922034764826178e-06, - "objective/entropy": 138.12388610839844, - "objective/kl": 25.93743324279785, - "objective/non_score_reward": -1.2968716621398926, - "objective/rlhf_reward": -3.828236812089367, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 26.206398010253906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.734375, - "step": 122, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0011606216430664 - }, - { - "episode": 1984, - "epoch": 0.011887215252064086, - "loss/policy_avg": 0.31653979420661926, - "lr": 9.921395705521473e-06, - "objective/entropy": -44.61676788330078, - "objective/kl": 21.166324615478516, - "objective/non_score_reward": -1.0583162307739258, - "objective/rlhf_reward": -2.9077520704566666, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 29.74887466430664, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.521484375, - "step": 123, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9996273517608643 - }, - { - "episode": 2000, - "epoch": 0.011983079891193635, - "loss/policy_avg": 0.1589316874742508, - "lr": 9.92075664621677e-06, - "objective/entropy": -77.4912109375, - "objective/kl": 20.79126739501953, - "objective/non_score_reward": -1.0395634174346924, - "objective/rlhf_reward": -2.4249199191729227, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 133.58343505859375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.66015625, - "step": 124, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9961724281311035 - }, - { - "episode": 2016, - "epoch": 0.012078944530323184, - "loss/policy_avg": 0.2586688995361328, - "lr": 9.920117586912067e-06, - "objective/entropy": 139.38818359375, - "objective/kl": 21.455245971679688, - "objective/non_score_reward": -1.072762370109558, - "objective/rlhf_reward": -2.775277876647648, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 47.609947204589844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8125, - "step": 125, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975237846374512 - }, - { - "episode": 2032, - "epoch": 0.012174809169452733, - "loss/policy_avg": 0.16066747903823853, - "lr": 9.919478527607362e-06, - "objective/entropy": 72.43231201171875, - "objective/kl": 20.59688377380371, - "objective/non_score_reward": -1.0298442840576172, - "objective/rlhf_reward": 0.28062304258346593, - "objective/scores": 1.1, - "policy/approxkl_avg": 75.74966430664062, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.529296875, - "step": 126, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998389720916748 - }, - { - "episode": 2048, - "epoch": 0.012270673808582282, - "loss/policy_avg": 0.07932023704051971, - "lr": 9.918839468302659e-06, - "objective/entropy": -12.7745361328125, - "objective/kl": 20.53061294555664, - "objective/non_score_reward": -1.0265307426452637, - "objective/rlhf_reward": -2.7275206232942164, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 19.110069274902344, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.55859375, - "step": 127, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984248876571655 - }, - { - "episode": 2064, - "epoch": 0.012366538447711831, - "loss/policy_avg": 0.27331969141960144, - "lr": 9.918200408997956e-06, - "objective/entropy": 101.82013702392578, - "objective/kl": 18.18286895751953, - "objective/non_score_reward": -0.9091434478759766, - "objective/rlhf_reward": -2.2579716230310023, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 6.703115463256836, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.556640625, - "step": 128, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0009617805480957 - }, - { - "episode": 2080, - "epoch": 0.01246240308684138, - "loss/policy_avg": 0.4916057586669922, - "lr": 9.917561349693252e-06, - "objective/entropy": 88.1321029663086, - "objective/kl": 23.30657958984375, - "objective/non_score_reward": -1.165329098701477, - "objective/rlhf_reward": -3.3020663795217704, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 142.93795776367188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.611328125, - "step": 129, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9967398643493652 - }, - { - "episode": 2096, - "epoch": 0.012558267725970929, - "loss/policy_avg": 0.16071423888206482, - "lr": 9.91692229038855e-06, - "objective/entropy": 136.1899871826172, - "objective/kl": 15.380975723266602, - "objective/non_score_reward": -0.769048810005188, - "objective/rlhf_reward": -0.6761951804161073, - "objective/scores": 0.6, - "policy/approxkl_avg": 28.551767349243164, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.56640625, - "step": 130, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.03311824798584 - }, - { - "episode": 2112, - "epoch": 0.012654132365100478, - "loss/policy_avg": 0.0021135974675416946, - "lr": 9.916283231083844e-06, - "objective/entropy": -71.15084838867188, - "objective/kl": 18.961715698242188, - "objective/non_score_reward": -0.9480857849121094, - "objective/rlhf_reward": -2.1304838709241016, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.844127893447876, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4833984375, - "step": 131, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0009913444519043 - }, - { - "episode": 2128, - "epoch": 0.012749997004230027, - "loss/policy_avg": 0.042635850608348846, - "lr": 9.915644171779141e-06, - "objective/entropy": 20.673603057861328, - "objective/kl": 15.986173629760742, - "objective/non_score_reward": -0.7993086576461792, - "objective/rlhf_reward": -1.8555989473158414, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 36.049034118652344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.67578125, - "step": 132, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998380422592163 - }, - { - "episode": 2144, - "epoch": 0.012845861643359576, - "loss/policy_avg": 0.46513473987579346, - "lr": 9.915005112474438e-06, - "objective/entropy": 5.5274505615234375, - "objective/kl": 19.590290069580078, - "objective/non_score_reward": -0.979514479637146, - "objective/rlhf_reward": -2.5394558692849696, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 12.074180603027344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.642578125, - "step": 133, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0011277198791504 - }, - { - "episode": 2160, - "epoch": 0.012941726282489125, - "loss/policy_avg": 0.245748370885849, - "lr": 9.914366053169735e-06, - "objective/entropy": 65.60797119140625, - "objective/kl": 19.637710571289062, - "objective/non_score_reward": -0.9818854928016663, - "objective/rlhf_reward": -1.980130786971982, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 50.17578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.791015625, - "step": 134, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9983665943145752 - }, - { - "episode": 2176, - "epoch": 0.013037590921618674, - "loss/policy_avg": 0.02180427499115467, - "lr": 9.913726993865032e-06, - "objective/entropy": 0.8936500549316406, - "objective/kl": 24.33076286315918, - "objective/non_score_reward": -1.2165381908416748, - "objective/rlhf_reward": -3.524517109900146, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 69.30375671386719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5009765625, - "step": 135, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.99745774269104 - }, - { - "episode": 2192, - "epoch": 0.013133455560748224, - "loss/policy_avg": 0.36717042326927185, - "lr": 9.913087934560329e-06, - "objective/entropy": 83.415283203125, - "objective/kl": 21.930896759033203, - "objective/non_score_reward": -1.0965447425842285, - "objective/rlhf_reward": -1.4624603136789527, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 79.15277862548828, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.546875, - "step": 136, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998199701309204 - }, - { - "episode": 2208, - "epoch": 0.013229320199877773, - "loss/policy_avg": 0.2460360825061798, - "lr": 9.912448875255624e-06, - "objective/entropy": 137.11976623535156, - "objective/kl": 21.218502044677734, - "objective/non_score_reward": -1.060925006866455, - "objective/rlhf_reward": -2.8198681666451373, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 67.851806640625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.666015625, - "step": 137, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9969704151153564 - }, - { - "episode": 2224, - "epoch": 0.013325184839007322, - "loss/policy_avg": 0.21244561672210693, - "lr": 9.911809815950921e-06, - "objective/entropy": 175.0180206298828, - "objective/kl": 16.889467239379883, - "objective/non_score_reward": -0.8444733619689941, - "objective/rlhf_reward": -1.4304821593331654, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 78.4537353515625, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.515625, - "step": 138, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9985227584838867 - }, - { - "episode": 2240, - "epoch": 0.013421049478136871, - "loss/policy_avg": 0.18417471647262573, - "lr": 9.911170756646218e-06, - "objective/entropy": 224.734619140625, - "objective/kl": 33.112342834472656, - "objective/non_score_reward": -1.6556169986724854, - "objective/rlhf_reward": -4.889135018984477, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 160.8165283203125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7109375, - "step": 139, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992772340774536 - }, - { - "episode": 2256, - "epoch": 0.01351691411726642, - "loss/policy_avg": 0.40639203786849976, - "lr": 9.910531697341515e-06, - "objective/entropy": 69.94343566894531, - "objective/kl": 24.266616821289062, - "objective/non_score_reward": -1.2133309841156006, - "objective/rlhf_reward": -3.40272543868576, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 126.5036392211914, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5625, - "step": 140, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999225378036499 - }, - { - "episode": 2272, - "epoch": 0.01361277875639597, - "loss/policy_avg": 0.28501349687576294, - "lr": 9.909892638036812e-06, - "objective/entropy": 61.523101806640625, - "objective/kl": 17.776689529418945, - "objective/non_score_reward": -0.8888344764709473, - "objective/rlhf_reward": -1.8220045725504557, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 87.0567398071289, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.537109375, - "step": 141, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000370740890503 - }, - { - "episode": 2288, - "epoch": 0.013708643395525518, - "loss/policy_avg": 0.30668091773986816, - "lr": 9.909253578732107e-06, - "objective/entropy": 227.46041870117188, - "objective/kl": 20.17832374572754, - "objective/non_score_reward": -1.0089161396026611, - "objective/rlhf_reward": -2.5198930142247047, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 50.498268127441406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.685546875, - "step": 142, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999216079711914 - }, - { - "episode": 2304, - "epoch": 0.013804508034655067, - "loss/policy_avg": 0.3348355293273926, - "lr": 9.908614519427404e-06, - "objective/entropy": 164.50863647460938, - "objective/kl": 13.646249771118164, - "objective/non_score_reward": -0.6823124885559082, - "objective/rlhf_reward": -1.1251298821607407, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 63.31299591064453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.95703125, - "step": 143, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9986834526062012 - }, - { - "episode": 2320, - "epoch": 0.013900372673784616, - "loss/policy_avg": 0.7517778277397156, - "lr": 9.9079754601227e-06, - "objective/entropy": -69.42684936523438, - "objective/kl": 13.007519721984863, - "objective/non_score_reward": -0.6503760814666748, - "objective/rlhf_reward": -0.2015041172504426, - "objective/scores": 0.6, - "policy/approxkl_avg": 15.501136779785156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.533203125, - "step": 144, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9969980716705322 - }, - { - "episode": 2336, - "epoch": 0.013996237312914165, - "loss/policy_avg": 0.1666509509086609, - "lr": 9.907336400817996e-06, - "objective/entropy": 175.3941192626953, - "objective/kl": 20.383106231689453, - "objective/non_score_reward": -1.0191553831100464, - "objective/rlhf_reward": -2.414762055099593, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 102.40309143066406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65625, - "step": 145, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9960074424743652 - }, - { - "episode": 2352, - "epoch": 0.014092101952043714, - "loss/policy_avg": 0.08111919462680817, - "lr": 9.906697341513293e-06, - "objective/entropy": 66.45804595947266, - "objective/kl": 20.63641357421875, - "objective/non_score_reward": -1.0318206548690796, - "objective/rlhf_reward": -2.7680326637968253, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 16.144962310791016, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.44921875, - "step": 146, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003509521484375 - }, - { - "episode": 2368, - "epoch": 0.014187966591173263, - "loss/policy_avg": 0.2162848860025406, - "lr": 9.90605828220859e-06, - "objective/entropy": 66.34003448486328, - "objective/kl": 21.03724479675293, - "objective/non_score_reward": -1.051862359046936, - "objective/rlhf_reward": -1.8074494361877442, - "objective/scores": 0.6, - "policy/approxkl_avg": 56.59767150878906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.544921875, - "step": 147, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9967288970947266 - }, - { - "episode": 2384, - "epoch": 0.014283831230302812, - "loss/policy_avg": 0.13452857732772827, - "lr": 9.905419222903886e-06, - "objective/entropy": 160.91929626464844, - "objective/kl": 22.133365631103516, - "objective/non_score_reward": -1.10666823387146, - "objective/rlhf_reward": -2.693339631954829, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 64.49358367919922, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62109375, - "step": 148, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988274574279785 - }, - { - "episode": 2400, - "epoch": 0.01437969586943236, - "loss/policy_avg": 1.6826289892196655, - "lr": 9.904780163599183e-06, - "objective/entropy": -182.28018188476562, - "objective/kl": 22.543842315673828, - "objective/non_score_reward": -1.1271920204162598, - "objective/rlhf_reward": -3.084936280449001, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 70.59880828857422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.62890625, - "step": 149, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0008223056793213 - }, - { - "episode": 2416, - "epoch": 0.01447556050856191, - "loss/policy_avg": 0.4059183597564697, - "lr": 9.904141104294478e-06, - "objective/entropy": 225.73135375976562, - "objective/kl": 23.115840911865234, - "objective/non_score_reward": -1.1557921171188354, - "objective/rlhf_reward": -2.8898351351420084, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 45.14168930053711, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.703125, - "step": 150, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9997532367706299 - }, - { - "episode": 2432, - "epoch": 0.01457142514769146, - "loss/policy_avg": 0.10681919753551483, - "lr": 9.903502044989775e-06, - "objective/entropy": 213.69598388671875, - "objective/kl": 26.178190231323242, - "objective/non_score_reward": -1.3089096546173096, - "objective/rlhf_reward": -3.894002726584106, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 92.52935791015625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69140625, - "step": 151, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9975218772888184 - }, - { - "episode": 2448, - "epoch": 0.01466728978682101, - "loss/policy_avg": -0.2853464186191559, - "lr": 9.902862985685072e-06, - "objective/entropy": 58.680572509765625, - "objective/kl": 17.81705665588379, - "objective/non_score_reward": -0.8908528089523315, - "objective/rlhf_reward": -0.6396921619188514, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 89.08941650390625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.669921875, - "step": 152, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0143747329711914 - }, - { - "episode": 2464, - "epoch": 0.014763154425950558, - "loss/policy_avg": 0.07825072109699249, - "lr": 9.902223926380369e-06, - "objective/entropy": 198.86288452148438, - "objective/kl": 28.436542510986328, - "objective/non_score_reward": -1.4218271970748901, - "objective/rlhf_reward": -2.7635896548044414, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 44.41461181640625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59375, - "step": 153, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9994783401489258 - }, - { - "episode": 2480, - "epoch": 0.014859019065080107, - "loss/policy_avg": 0.27155977487564087, - "lr": 9.901584867075666e-06, - "objective/entropy": 89.04707336425781, - "objective/kl": 21.113758087158203, - "objective/non_score_reward": -1.0556879043579102, - "objective/rlhf_reward": -1.2990326031458106, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 58.70441818237305, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.623046875, - "step": 154, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9971623420715332 - }, - { - "episode": 2496, - "epoch": 0.014954883704209656, - "loss/policy_avg": 0.3080964982509613, - "lr": 9.900945807770961e-06, - "objective/entropy": 35.38983154296875, - "objective/kl": 21.02568817138672, - "objective/non_score_reward": -1.0512844324111938, - "objective/rlhf_reward": -2.7241851715401406, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 52.82551193237305, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.560546875, - "step": 155, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9960044622421265 - }, - { - "episode": 2512, - "epoch": 0.015050748343339205, - "loss/policy_avg": 4.562356472015381, - "lr": 9.900306748466258e-06, - "objective/entropy": 253.11752319335938, - "objective/kl": 22.01451301574707, - "objective/non_score_reward": -1.1007256507873535, - "objective/rlhf_reward": -2.798782501284199, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 74.26364135742188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.765625, - "step": 156, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9965643882751465 - }, - { - "episode": 2528, - "epoch": 0.015146612982468754, - "loss/policy_avg": 0.21197248995304108, - "lr": 9.899667689161555e-06, - "objective/entropy": 149.58770751953125, - "objective/kl": 23.317626953125, - "objective/non_score_reward": -1.1658812761306763, - "objective/rlhf_reward": -2.2635251045227047, - "objective/scores": 0.6, - "policy/approxkl_avg": 51.574981689453125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4736328125, - "step": 157, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.995574951171875 - }, - { - "episode": 2544, - "epoch": 0.015242477621598303, - "loss/policy_avg": 0.20880039036273956, - "lr": 9.899028629856852e-06, - "objective/entropy": -64.38532257080078, - "objective/kl": 25.92443084716797, - "objective/non_score_reward": -1.2962216138839722, - "objective/rlhf_reward": -3.784886217117309, - "objective/scores": 0.35, - "policy/approxkl_avg": 138.45706176757812, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.568359375, - "step": 158, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9968822002410889 - }, - { - "episode": 2560, - "epoch": 0.015338342260727852, - "loss/policy_avg": 0.21600359678268433, - "lr": 9.898389570552149e-06, - "objective/entropy": 3.545970916748047, - "objective/kl": 23.09051513671875, - "objective/non_score_reward": -1.1545257568359375, - "objective/rlhf_reward": -2.6706922007369354, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 36.885650634765625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.55859375, - "step": 159, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9993218183517456 - }, - { - "episode": 2576, - "epoch": 0.015434206899857401, - "loss/policy_avg": 0.5031390190124512, - "lr": 9.897750511247446e-06, - "objective/entropy": 98.00604248046875, - "objective/kl": 25.33047866821289, - "objective/non_score_reward": -1.2665239572525024, - "objective/rlhf_reward": -3.4619760847726635, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 83.63774871826172, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.671875, - "step": 160, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000274896621704 - }, - { - "episode": 2592, - "epoch": 0.01553007153898695, - "loss/policy_avg": 0.018053412437438965, - "lr": 9.89711145194274e-06, - "objective/entropy": 2.8434524536132812, - "objective/kl": 24.395084381103516, - "objective/non_score_reward": -1.2197542190551758, - "objective/rlhf_reward": -3.2171576074963673, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.6353378295898438, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.64453125, - "step": 161, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001478433609009 - }, - { - "episode": 2608, - "epoch": 0.0156259361781165, - "loss/policy_avg": 0.25576311349868774, - "lr": 9.896472392638038e-06, - "objective/entropy": -64.24278259277344, - "objective/kl": 16.287256240844727, - "objective/non_score_reward": -0.8143627643585205, - "objective/rlhf_reward": -1.5241178731123606, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 25.824050903320312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6953125, - "step": 162, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984066486358643 - }, - { - "episode": 2624, - "epoch": 0.01572180081724605, - "loss/policy_avg": 0.2750253677368164, - "lr": 9.895833333333334e-06, - "objective/entropy": 170.5203857421875, - "objective/kl": 35.09113693237305, - "objective/non_score_reward": -1.7545567750930786, - "objective/rlhf_reward": -4.094508086086485, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 91.88323974609375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.76171875, - "step": 163, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9978190660476685 - }, - { - "episode": 2640, - "epoch": 0.0158176654563756, - "loss/policy_avg": 0.2685161828994751, - "lr": 9.895194274028631e-06, - "objective/entropy": 107.911376953125, - "objective/kl": 21.708637237548828, - "objective/non_score_reward": -1.0854318141937256, - "objective/rlhf_reward": -2.8911290570214834, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 48.546165466308594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.603515625, - "step": 164, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9939230680465698 - }, - { - "episode": 2656, - "epoch": 0.015913530095505148, - "loss/policy_avg": 0.3802343010902405, - "lr": 9.894555214723928e-06, - "objective/entropy": 137.427978515625, - "objective/kl": 20.673809051513672, - "objective/non_score_reward": -1.0336904525756836, - "objective/rlhf_reward": -2.793125978022247, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 36.90850830078125, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.63671875, - "step": 165, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9987661838531494 - }, - { - "episode": 2672, - "epoch": 0.016009394734634697, - "loss/policy_avg": 0.0008638650178909302, - "lr": 9.893916155419225e-06, - "objective/entropy": 159.45681762695312, - "objective/kl": 20.339492797851562, - "objective/non_score_reward": -1.016974687576294, - "objective/rlhf_reward": -2.7086488542303275, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 6.459288597106934, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.515625, - "step": 166, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9977226257324219 - }, - { - "episode": 2688, - "epoch": 0.016105259373764245, - "loss/policy_avg": 0.3463206887245178, - "lr": 9.89327709611452e-06, - "objective/entropy": -75.2735824584961, - "objective/kl": 27.865215301513672, - "objective/non_score_reward": -1.3932607173919678, - "objective/rlhf_reward": -4.173042631149292, - "objective/scores": 0.35, - "policy/approxkl_avg": 139.90060424804688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587890625, - "step": 167, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0016684532165527 - }, - { - "episode": 2704, - "epoch": 0.016201124012893794, - "loss/policy_avg": 0.07642253488302231, - "lr": 9.892638036809815e-06, - "objective/entropy": 38.99913787841797, - "objective/kl": 19.061498641967773, - "objective/non_score_reward": -0.9530749320983887, - "objective/rlhf_reward": -1.987470920356821, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 22.035629272460938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.484375, - "step": 168, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0013604164123535 - }, - { - "episode": 2720, - "epoch": 0.016296988652023343, - "loss/policy_avg": 0.2990867495536804, - "lr": 9.891998977505112e-06, - "objective/entropy": 199.7046661376953, - "objective/kl": 23.46067237854004, - "objective/non_score_reward": -1.1730337142944336, - "objective/rlhf_reward": -3.268302519519893, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 19.572267532348633, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6171875, - "step": 169, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.998270034790039 - }, - { - "episode": 2736, - "epoch": 0.016392853291152892, - "loss/policy_avg": 0.3040146231651306, - "lr": 9.89135991820041e-06, - "objective/entropy": 84.5781021118164, - "objective/kl": 24.218996047973633, - "objective/non_score_reward": -1.2109497785568237, - "objective/rlhf_reward": -2.896387885289128, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 91.4429931640625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 170, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0007870197296143 - }, - { - "episode": 2752, - "epoch": 0.01648871793028244, - "loss/policy_avg": 0.24132516980171204, - "lr": 9.890720858895706e-06, - "objective/entropy": 25.26891326904297, - "objective/kl": 12.311616897583008, - "objective/non_score_reward": -0.6155807971954346, - "objective/rlhf_reward": -2.4623232781887054, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.089572906494141, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 171, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984550476074219 - }, - { - "episode": 2768, - "epoch": 0.01658458256941199, - "loss/policy_avg": 0.07815683633089066, - "lr": 9.890081799591003e-06, - "objective/entropy": -2.7739601135253906, - "objective/kl": 20.480499267578125, - "objective/non_score_reward": -1.0240248441696167, - "objective/rlhf_reward": -2.6151468185738325, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 11.766371726989746, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.52734375, - "step": 172, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999391794204712 - }, - { - "episode": 2784, - "epoch": 0.01668044720854154, - "loss/policy_avg": 0.31003671884536743, - "lr": 9.8894427402863e-06, - "objective/entropy": -5.804538726806641, - "objective/kl": 23.551572799682617, - "objective/non_score_reward": -1.1775786876678467, - "objective/rlhf_reward": -3.2597167297319025, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 241.19540405273438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.587890625, - "step": 173, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990514516830444 - }, - { - "episode": 2800, - "epoch": 0.016776311847671088, - "loss/policy_avg": 0.027285143733024597, - "lr": 9.888803680981595e-06, - "objective/entropy": 91.14071655273438, - "objective/kl": 19.611085891723633, - "objective/non_score_reward": -0.9805543422698975, - "objective/rlhf_reward": -2.44126462471044, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 60.10600662231445, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.537109375, - "step": 174, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9972784519195557 - }, - { - "episode": 2816, - "epoch": 0.016872176486800637, - "loss/policy_avg": 0.2845172882080078, - "lr": 9.888164621676892e-06, - "objective/entropy": 30.190153121948242, - "objective/kl": 24.783939361572266, - "objective/non_score_reward": -1.239197015762329, - "objective/rlhf_reward": -3.578185775367123, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 76.30748748779297, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.443359375, - "step": 175, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994440078735352 - }, - { - "episode": 2832, - "epoch": 0.016968041125930186, - "loss/policy_avg": 0.5662503838539124, - "lr": 9.887525562372189e-06, - "objective/entropy": 60.807342529296875, - "objective/kl": 12.370782852172852, - "objective/non_score_reward": -0.6185390949249268, - "objective/rlhf_reward": -1.0503242506581225, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 14.155126571655273, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.525390625, - "step": 176, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9987331628799438 - }, - { - "episode": 2848, - "epoch": 0.017063905765059735, - "loss/policy_avg": 0.08586982637643814, - "lr": 9.886886503067486e-06, - "objective/entropy": 43.38105010986328, - "objective/kl": 24.246856689453125, - "objective/non_score_reward": -1.2123429775238037, - "objective/rlhf_reward": -3.470769503203732, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 141.50592041015625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.64453125, - "step": 177, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9969239234924316 - }, - { - "episode": 2864, - "epoch": 0.017159770404189284, - "loss/policy_avg": 0.26094895601272583, - "lr": 9.886247443762783e-06, - "objective/entropy": 54.85191345214844, - "objective/kl": 20.912307739257812, - "objective/non_score_reward": -1.0456154346466064, - "objective/rlhf_reward": -2.7824616193771363, - "objective/scores": 0.35, - "policy/approxkl_avg": 19.43996810913086, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4755859375, - "step": 178, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0007808208465576 - }, - { - "episode": 2880, - "epoch": 0.017255635043318833, - "loss/policy_avg": -0.0008885636925697327, - "lr": 9.88560838445808e-06, - "objective/entropy": 1.5364952087402344, - "objective/kl": 18.547964096069336, - "objective/non_score_reward": -0.9273982048034668, - "objective/rlhf_reward": -1.762181530671056, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 103.84625244140625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.52734375, - "step": 179, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0031652450561523 - }, - { - "episode": 2896, - "epoch": 0.017351499682448382, - "loss/policy_avg": 0.07095308601856232, - "lr": 9.884969325153375e-06, - "objective/entropy": -57.707908630371094, - "objective/kl": 17.486156463623047, - "objective/non_score_reward": -0.8743079304695129, - "objective/rlhf_reward": -1.3745254895844794, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 35.78956604003906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.63671875, - "step": 180, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995075464248657 - }, - { - "episode": 2912, - "epoch": 0.01744736432157793, - "loss/policy_avg": 0.42247164249420166, - "lr": 9.884330265848671e-06, - "objective/entropy": 194.7113037109375, - "objective/kl": 21.53358268737793, - "objective/non_score_reward": -1.0766791105270386, - "objective/rlhf_reward": -2.750457256045893, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 58.89783477783203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.720703125, - "step": 181, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.996431827545166 - }, - { - "episode": 2928, - "epoch": 0.01754322896070748, - "loss/policy_avg": 0.3189627528190613, - "lr": 9.883691206543968e-06, - "objective/entropy": 125.43355560302734, - "objective/kl": 20.729223251342773, - "objective/non_score_reward": -1.0364612340927124, - "objective/rlhf_reward": -2.767242708293301, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 31.974578857421875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.60546875, - "step": 182, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984747171401978 - }, - { - "episode": 2944, - "epoch": 0.01763909359983703, - "loss/policy_avg": 0.19416040182113647, - "lr": 9.883052147239265e-06, - "objective/entropy": 127.4957275390625, - "objective/kl": 23.107641220092773, - "objective/non_score_reward": -1.1553820371627808, - "objective/rlhf_reward": -3.2429258609689295, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 41.45734786987305, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.376953125, - "step": 183, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999535322189331 - }, - { - "episode": 2960, - "epoch": 0.017734958238966578, - "loss/policy_avg": 0.04916887357831001, - "lr": 9.882413087934562e-06, - "objective/entropy": -16.33904266357422, - "objective/kl": 15.624849319458008, - "objective/non_score_reward": -0.7812424898147583, - "objective/rlhf_reward": -1.002263667360816, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 86.75860595703125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8203125, - "step": 184, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9967164993286133 - }, - { - "episode": 2976, - "epoch": 0.017830822878096127, - "loss/policy_avg": 0.15854808688163757, - "lr": 9.881774028629857e-06, - "objective/entropy": -9.968147277832031, - "objective/kl": 20.46514320373535, - "objective/non_score_reward": -1.0232571363449097, - "objective/rlhf_reward": -2.35969527165095, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 16.395225524902344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5859375, - "step": 185, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9976359605789185 - }, - { - "episode": 2992, - "epoch": 0.017926687517225676, - "loss/policy_avg": 0.36498603224754333, - "lr": 9.881134969325154e-06, - "objective/entropy": 209.59991455078125, - "objective/kl": 18.690290451049805, - "objective/non_score_reward": -0.9345145225524902, - "objective/rlhf_reward": -2.338058030605316, - "objective/scores": 0.35, - "policy/approxkl_avg": 12.64120101928711, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.623046875, - "step": 186, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9994118213653564 - }, - { - "episode": 3008, - "epoch": 0.018022552156355228, - "loss/policy_avg": 0.15073028206825256, - "lr": 9.880495910020451e-06, - "objective/entropy": 33.50044250488281, - "objective/kl": 21.099205017089844, - "objective/non_score_reward": -1.0549602508544922, - "objective/rlhf_reward": 0.1801587581634525, - "objective/scores": 1.1, - "policy/approxkl_avg": 28.017484664916992, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.666015625, - "step": 187, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000263214111328 - }, - { - "episode": 3024, - "epoch": 0.018118416795484777, - "loss/policy_avg": 0.04914219304919243, - "lr": 9.879856850715748e-06, - "objective/entropy": 109.99685668945312, - "objective/kl": 23.795440673828125, - "objective/non_score_reward": -1.1897720098495483, - "objective/rlhf_reward": -0.3590880990028378, - "objective/scores": 1.1, - "policy/approxkl_avg": 17.797225952148438, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.529296875, - "step": 188, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0002386569976807 - }, - { - "episode": 3040, - "epoch": 0.018214281434614326, - "loss/policy_avg": 0.26782599091529846, - "lr": 9.879217791411043e-06, - "objective/entropy": 46.40031051635742, - "objective/kl": 15.295504570007324, - "objective/non_score_reward": -0.764775276184082, - "objective/rlhf_reward": -1.6998512086614799, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 19.033124923706055, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4287109375, - "step": 189, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0006394386291504 - }, - { - "episode": 3056, - "epoch": 0.018310146073743875, - "loss/policy_avg": -0.0003484562039375305, - "lr": 9.87857873210634e-06, - "objective/entropy": -128.13638305664062, - "objective/kl": 23.236797332763672, - "objective/non_score_reward": -1.1618399620056152, - "objective/rlhf_reward": -2.985500340879546, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 122.61852264404297, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.521484375, - "step": 190, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998673439025879 - }, - { - "episode": 3072, - "epoch": 0.018406010712873424, - "loss/policy_avg": 0.285878986120224, - "lr": 9.877939672801637e-06, - "objective/entropy": -155.79151916503906, - "objective/kl": 17.15728187561035, - "objective/non_score_reward": -0.8578640818595886, - "objective/rlhf_reward": -1.6981231282154718, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 27.024686813354492, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.578125, - "step": 191, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977209568023682 - }, - { - "episode": 3088, - "epoch": 0.018501875352002973, - "loss/policy_avg": 0.03845605254173279, - "lr": 9.877300613496934e-06, - "objective/entropy": -79.23377227783203, - "objective/kl": 24.854154586791992, - "objective/non_score_reward": -1.2427077293395996, - "objective/rlhf_reward": -3.4145718505054266, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 108.08650970458984, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.54296875, - "step": 192, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9965288639068604 - }, - { - "episode": 3104, - "epoch": 0.018597739991132522, - "loss/policy_avg": 0.22054271399974823, - "lr": 9.876661554192229e-06, - "objective/entropy": 58.46562576293945, - "objective/kl": 18.69571876525879, - "objective/non_score_reward": -0.9347859621047974, - "objective/rlhf_reward": -1.3391437292099, - "objective/scores": 0.6, - "policy/approxkl_avg": 17.535587310791016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 193, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.996351957321167 - }, - { - "episode": 3120, - "epoch": 0.01869360463026207, - "loss/policy_avg": 0.46004775166511536, - "lr": 9.876022494887526e-06, - "objective/entropy": 208.6689453125, - "objective/kl": 24.537294387817383, - "objective/non_score_reward": -1.2268648147583008, - "objective/rlhf_reward": -3.3511998941570074, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 103.11289978027344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6171875, - "step": 194, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980366230010986 - }, - { - "episode": 3136, - "epoch": 0.01878946926939162, - "loss/policy_avg": 0.14284425973892212, - "lr": 9.875383435582823e-06, - "objective/entropy": -140.25045776367188, - "objective/kl": 21.156387329101562, - "objective/non_score_reward": -1.0578192472457886, - "objective/rlhf_reward": -1.8312772423028945, - "objective/scores": 0.6, - "policy/approxkl_avg": 95.11038208007812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69921875, - "step": 195, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0021564960479736 - }, - { - "episode": 3152, - "epoch": 0.01888533390852117, - "loss/policy_avg": 0.4036502540111542, - "lr": 9.87474437627812e-06, - "objective/entropy": 97.97139739990234, - "objective/kl": 20.765098571777344, - "objective/non_score_reward": -1.038254737854004, - "objective/rlhf_reward": -1.7530193686485291, - "objective/scores": 0.6, - "policy/approxkl_avg": 33.61680603027344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.75, - "step": 196, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9960455894470215 - }, - { - "episode": 3168, - "epoch": 0.018981198547650718, - "loss/policy_avg": 0.03367016091942787, - "lr": 9.874105316973416e-06, - "objective/entropy": 110.7692642211914, - "objective/kl": 32.466636657714844, - "objective/non_score_reward": -1.6233320236206055, - "objective/rlhf_reward": -4.668499465259623, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 22.905399322509766, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66015625, - "step": 197, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000126361846924 - }, - { - "episode": 3184, - "epoch": 0.019077063186780267, - "loss/policy_avg": 0.3382406532764435, - "lr": 9.873466257668712e-06, - "objective/entropy": -46.87655258178711, - "objective/kl": 23.83783531188965, - "objective/non_score_reward": -1.1918917894363403, - "objective/rlhf_reward": -3.44205424550168, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 26.46108055114746, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4814453125, - "step": 198, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9974579811096191 - }, - { - "episode": 3200, - "epoch": 0.019172927825909816, - "loss/policy_avg": 0.05052588880062103, - "lr": 9.872827198364009e-06, - "objective/entropy": -62.79549789428711, - "objective/kl": 19.587276458740234, - "objective/non_score_reward": -0.9793638586997986, - "objective/rlhf_reward": -0.9937364205133643, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 37.62165069580078, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.564453125, - "step": 199, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9972220659255981 - }, - { - "episode": 3216, - "epoch": 0.019268792465039365, - "loss/policy_avg": 0.2230260968208313, - "lr": 9.872188139059305e-06, - "objective/entropy": -37.75834655761719, - "objective/kl": 23.102069854736328, - "objective/non_score_reward": -1.1551035642623901, - "objective/rlhf_reward": -3.2787786035830075, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 56.49012756347656, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.583984375, - "step": 200, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000248908996582 - }, - { - "episode": 3232, - "epoch": 0.019364657104168913, - "loss/policy_avg": 0.4118785858154297, - "lr": 9.871549079754602e-06, - "objective/entropy": 85.49769592285156, - "objective/kl": 25.69809913635254, - "objective/non_score_reward": -1.284904956817627, - "objective/rlhf_reward": -3.5833605816036016, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 56.752174377441406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.66015625, - "step": 201, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987632036209106 - }, - { - "episode": 3248, - "epoch": 0.019460521743298462, - "loss/policy_avg": 0.06031988561153412, - "lr": 9.8709100204499e-06, - "objective/entropy": 16.456554412841797, - "objective/kl": 25.35955047607422, - "objective/non_score_reward": -1.2679774761199951, - "objective/rlhf_reward": -3.6213118239358515, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 21.745624542236328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.529296875, - "step": 202, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9980382919311523 - }, - { - "episode": 3264, - "epoch": 0.01955638638242801, - "loss/policy_avg": 0.06312263011932373, - "lr": 9.870270961145196e-06, - "objective/entropy": 132.99948120117188, - "objective/kl": 22.432659149169922, - "objective/non_score_reward": -1.1216330528259277, - "objective/rlhf_reward": -2.8246725253468616, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 93.43849182128906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.568359375, - "step": 203, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995737075805664 - }, - { - "episode": 3280, - "epoch": 0.01965225102155756, - "loss/policy_avg": 0.6064414978027344, - "lr": 9.869631901840491e-06, - "objective/entropy": -19.207683563232422, - "objective/kl": 18.83993148803711, - "objective/non_score_reward": -0.9419965744018555, - "objective/rlhf_reward": -2.3173880978540033, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 90.60572052001953, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4931640625, - "step": 204, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000382661819458 - }, - { - "episode": 3296, - "epoch": 0.01974811566068711, - "loss/policy_avg": 0.2940763831138611, - "lr": 9.868992842535788e-06, - "objective/entropy": 83.77371978759766, - "objective/kl": 25.884700775146484, - "objective/non_score_reward": -1.2942349910736084, - "objective/rlhf_reward": -3.3521112903681507, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 39.873409271240234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.447265625, - "step": 205, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9972522258758545 - }, - { - "episode": 3312, - "epoch": 0.019843980299816658, - "loss/policy_avg": 0.18257562816143036, - "lr": 9.868353783231085e-06, - "objective/entropy": 119.6646728515625, - "objective/kl": 27.568458557128906, - "objective/non_score_reward": -1.3784228563308716, - "objective/rlhf_reward": -1.1136915445327755, - "objective/scores": 1.1, - "policy/approxkl_avg": 48.24208068847656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.66796875, - "step": 206, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987564086914062 - }, - { - "episode": 3328, - "epoch": 0.019939844938946207, - "loss/policy_avg": -0.011964879930019379, - "lr": 9.867714723926382e-06, - "objective/entropy": 79.78416442871094, - "objective/kl": 24.409799575805664, - "objective/non_score_reward": -1.2204899787902832, - "objective/rlhf_reward": -3.5033578658975184, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 19.269145965576172, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4658203125, - "step": 207, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000608205795288 - }, - { - "episode": 3344, - "epoch": 0.020035709578075756, - "loss/policy_avg": 0.04908262565732002, - "lr": 9.867075664621679e-06, - "objective/entropy": 174.413818359375, - "objective/kl": 24.83539581298828, - "objective/non_score_reward": -1.241769790649414, - "objective/rlhf_reward": -3.3629594779649548, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 14.995980262756348, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.54296875, - "step": 208, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9985637664794922 - }, - { - "episode": 3360, - "epoch": 0.020131574217205305, - "loss/policy_avg": 0.14710021018981934, - "lr": 9.866436605316974e-06, - "objective/entropy": 132.51194763183594, - "objective/kl": 29.743432998657227, - "objective/non_score_reward": -1.4871716499328613, - "objective/rlhf_reward": -4.344566795889454, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 65.08041381835938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.490234375, - "step": 209, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0021185874938965 - }, - { - "episode": 3376, - "epoch": 0.020227438856334854, - "loss/policy_avg": 0.0796532854437828, - "lr": 9.86579754601227e-06, - "objective/entropy": 1.3461151123046875, - "objective/kl": 26.279298782348633, - "objective/non_score_reward": -1.313965082168579, - "objective/rlhf_reward": -0.8558599710464474, - "objective/scores": 1.1, - "policy/approxkl_avg": 105.49284362792969, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5859375, - "step": 210, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989545345306396 - }, - { - "episode": 3392, - "epoch": 0.020323303495464403, - "loss/policy_avg": -0.03664415329694748, - "lr": 9.865158486707568e-06, - "objective/entropy": -37.266082763671875, - "objective/kl": 19.48423957824707, - "objective/non_score_reward": -0.9742119908332825, - "objective/rlhf_reward": -0.9731288298380103, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 8.304027557373047, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.638671875, - "step": 211, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.003216028213501 - }, - { - "episode": 3408, - "epoch": 0.020419168134593952, - "loss/policy_avg": 0.30985838174819946, - "lr": 9.864519427402863e-06, - "objective/entropy": 94.80859375, - "objective/kl": 29.94342041015625, - "objective/non_score_reward": -1.4971709251403809, - "objective/rlhf_reward": -4.564851482112971, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 115.7642593383789, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.74609375, - "step": 212, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9959279298782349 - }, - { - "episode": 3424, - "epoch": 0.0205150327737235, - "loss/policy_avg": 0.23234406113624573, - "lr": 9.86388036809816e-06, - "objective/entropy": 125.32878875732422, - "objective/kl": 33.22450637817383, - "objective/non_score_reward": -1.6612253189086914, - "objective/rlhf_reward": -4.820072407993387, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 82.43852233886719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 213, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001713752746582 - }, - { - "episode": 3440, - "epoch": 0.02061089741285305, - "loss/policy_avg": 1.5097947120666504, - "lr": 9.863241308793457e-06, - "objective/entropy": 132.66845703125, - "objective/kl": 27.622318267822266, - "objective/non_score_reward": -1.3811159133911133, - "objective/rlhf_reward": -3.6996345475044956, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 26.179336547851562, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.708984375, - "step": 214, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9993813037872314 - }, - { - "episode": 3456, - "epoch": 0.0207067620519826, - "loss/policy_avg": 0.12209601700305939, - "lr": 9.862602249488753e-06, - "objective/entropy": 132.88406372070312, - "objective/kl": 26.24971580505371, - "objective/non_score_reward": -1.312485694885254, - "objective/rlhf_reward": -5.249942898750305, - "objective/scores": 0.0, - "policy/approxkl_avg": 41.524139404296875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7109375, - "step": 215, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9990661144256592 - }, - { - "episode": 3472, - "epoch": 0.02080262669111215, - "loss/policy_avg": 0.3654727339744568, - "lr": 9.86196319018405e-06, - "objective/entropy": 39.344974517822266, - "objective/kl": 23.619754791259766, - "objective/non_score_reward": -1.18098783493042, - "objective/rlhf_reward": -1.8002320870172706, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 5.19040584564209, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4951171875, - "step": 216, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990694522857666 - }, - { - "episode": 3488, - "epoch": 0.0208984913302417, - "loss/policy_avg": 0.05907230079174042, - "lr": 9.861324130879346e-06, - "objective/entropy": -49.055564880371094, - "objective/kl": 27.70423126220703, - "objective/non_score_reward": -1.3852115869522095, - "objective/rlhf_reward": -3.8789869598752125, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 62.16511917114258, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.609375, - "step": 217, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973804950714111 - }, - { - "episode": 3504, - "epoch": 0.02099435596937125, - "loss/policy_avg": 0.5758800506591797, - "lr": 9.860685071574642e-06, - "objective/entropy": 18.1787166595459, - "objective/kl": 25.688358306884766, - "objective/non_score_reward": -1.2844178676605225, - "objective/rlhf_reward": -2.2139523147952285, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 23.39984130859375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.498046875, - "step": 218, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9974064826965332 - }, - { - "episode": 3520, - "epoch": 0.021090220608500798, - "loss/policy_avg": 0.2610527575016022, - "lr": 9.86004601226994e-06, - "objective/entropy": -68.09791564941406, - "objective/kl": 26.7615966796875, - "objective/non_score_reward": -1.3380796909332275, - "objective/rlhf_reward": -4.026806149512453, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 124.13450622558594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4599609375, - "step": 219, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986741542816162 - }, - { - "episode": 3536, - "epoch": 0.021186085247630347, - "loss/policy_avg": 0.1624567210674286, - "lr": 9.859406952965236e-06, - "objective/entropy": -113.99856567382812, - "objective/kl": 19.689868927001953, - "objective/non_score_reward": -0.9844935536384583, - "objective/rlhf_reward": -2.113145466121744, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 45.295875549316406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 220, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0004196166992188 - }, - { - "episode": 3552, - "epoch": 0.021281949886759896, - "loss/policy_avg": 0.13548433780670166, - "lr": 9.858767893660533e-06, - "objective/entropy": 154.66708374023438, - "objective/kl": 31.08365249633789, - "objective/non_score_reward": -1.554182529449463, - "objective/rlhf_reward": -4.554870968282805, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 43.560997009277344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7265625, - "step": 221, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9972419738769531 - }, - { - "episode": 3568, - "epoch": 0.021377814525889445, - "loss/policy_avg": 0.04025420919060707, - "lr": 9.858128834355828e-06, - "objective/entropy": 145.02468872070312, - "objective/kl": 31.459678649902344, - "objective/non_score_reward": -1.572983980178833, - "objective/rlhf_reward": -4.932686292861385, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 41.05935287475586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4560546875, - "step": 222, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0009706020355225 - }, - { - "episode": 3584, - "epoch": 0.021473679165018994, - "loss/policy_avg": 1.5885295867919922, - "lr": 9.857489775051125e-06, - "objective/entropy": 141.5781707763672, - "objective/kl": 34.53314971923828, - "objective/non_score_reward": -1.726657509803772, - "objective/rlhf_reward": -5.244770532072174, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 37.03607177734375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.658203125, - "step": 223, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.992570400238037 - }, - { - "episode": 3600, - "epoch": 0.021569543804148543, - "loss/policy_avg": 0.9811650514602661, - "lr": 9.856850715746422e-06, - "objective/entropy": -30.946441650390625, - "objective/kl": 29.145998001098633, - "objective/non_score_reward": -1.4572999477386475, - "objective/rlhf_reward": -4.450597622481686, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 19.481060028076172, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.462890625, - "step": 224, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983983039855957 - }, - { - "episode": 3616, - "epoch": 0.021665408443278092, - "loss/policy_avg": 0.5196128487586975, - "lr": 9.856211656441719e-06, - "objective/entropy": -16.55962371826172, - "objective/kl": 28.4706974029541, - "objective/non_score_reward": -1.423534870147705, - "objective/rlhf_reward": -3.5714332482972484, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 117.12289428710938, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.732421875, - "step": 225, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9975980520248413 - }, - { - "episode": 3632, - "epoch": 0.02176127308240764, - "loss/policy_avg": 0.6528609395027161, - "lr": 9.855572597137016e-06, - "objective/entropy": 136.64077758789062, - "objective/kl": 32.46646499633789, - "objective/non_score_reward": -1.6233232021331787, - "objective/rlhf_reward": -2.093292927742004, - "objective/scores": 1.1, - "policy/approxkl_avg": 44.35145950317383, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.669921875, - "step": 226, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994146823883057 - }, - { - "episode": 3648, - "epoch": 0.02185713772153719, - "loss/policy_avg": 0.9434906244277954, - "lr": 9.854933537832313e-06, - "objective/entropy": -36.75615310668945, - "objective/kl": 31.890575408935547, - "objective/non_score_reward": -1.5945286750793457, - "objective/rlhf_reward": -5.052601966887636, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 65.19577026367188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.59375, - "step": 227, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9979374408721924 - }, - { - "episode": 3664, - "epoch": 0.02195300236066674, - "loss/policy_avg": 0.36130765080451965, - "lr": 9.854294478527608e-06, - "objective/entropy": 47.61101531982422, - "objective/kl": 18.669593811035156, - "objective/non_score_reward": -0.9334796071052551, - "objective/rlhf_reward": -2.3339184284210206, - "objective/scores": 0.35, - "policy/approxkl_avg": 15.266149520874023, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.578125, - "step": 228, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9959979057312012 - }, - { - "episode": 3680, - "epoch": 0.022048866999796288, - "loss/policy_avg": 0.18321090936660767, - "lr": 9.853655419222905e-06, - "objective/entropy": 116.60293579101562, - "objective/kl": 27.56112289428711, - "objective/non_score_reward": -1.378056287765503, - "objective/rlhf_reward": -3.5648136837052657, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 29.471284866333008, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.537109375, - "step": 229, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991540908813477 - }, - { - "episode": 3696, - "epoch": 0.022144731638925837, - "loss/policy_avg": -0.044996485114097595, - "lr": 9.853016359918202e-06, - "objective/entropy": 38.275238037109375, - "objective/kl": 28.720836639404297, - "objective/non_score_reward": -1.4360418319702148, - "objective/rlhf_reward": -4.187907754388407, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 173.6102752685547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.552734375, - "step": 230, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997882604598999 - }, - { - "episode": 3712, - "epoch": 0.022240596278055386, - "loss/policy_avg": 0.027855467051267624, - "lr": 9.852377300613498e-06, - "objective/entropy": 123.59611511230469, - "objective/kl": 30.175601959228516, - "objective/non_score_reward": -1.5087801218032837, - "objective/rlhf_reward": -4.478861062732294, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 50.733642578125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.37109375, - "step": 231, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003466606140137 - }, - { - "episode": 3728, - "epoch": 0.022336460917184935, - "loss/policy_avg": -0.3093503713607788, - "lr": 9.851738241308795e-06, - "objective/entropy": 0.438995361328125, - "objective/kl": 27.025171279907227, - "objective/non_score_reward": -1.3512585163116455, - "objective/rlhf_reward": -5.405034303665161, - "objective/scores": 0.0, - "policy/approxkl_avg": 13.092641830444336, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.615234375, - "step": 232, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000675678253174 - }, - { - "episode": 3744, - "epoch": 0.022432325556314484, - "loss/policy_avg": -0.05236402899026871, - "lr": 9.85109918200409e-06, - "objective/entropy": 112.74819946289062, - "objective/kl": 24.94538688659668, - "objective/non_score_reward": -1.2472693920135498, - "objective/rlhf_reward": -3.473305845054325, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 19.200075149536133, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.3583984375, - "step": 233, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.002117395401001 - }, - { - "episode": 3760, - "epoch": 0.022528190195444033, - "loss/policy_avg": 0.21103611588478088, - "lr": 9.850460122699387e-06, - "objective/entropy": 73.77043151855469, - "objective/kl": 28.00216293334961, - "objective/non_score_reward": -1.4001080989837646, - "objective/rlhf_reward": -3.6530211669968917, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 13.291183471679688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5009765625, - "step": 234, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995331764221191 - }, - { - "episode": 3776, - "epoch": 0.02262405483457358, - "loss/policy_avg": 0.6418443918228149, - "lr": 9.849821063394683e-06, - "objective/entropy": 19.92426300048828, - "objective/kl": 31.282997131347656, - "objective/non_score_reward": -1.5641499757766724, - "objective/rlhf_reward": -4.931086901456041, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 98.59768676757812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.34375, - "step": 235, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0005524158477783 - }, - { - "episode": 3792, - "epoch": 0.02271991947370313, - "loss/policy_avg": 0.20836295187473297, - "lr": 9.84918200408998e-06, - "objective/entropy": 28.238201141357422, - "objective/kl": 29.105060577392578, - "objective/non_score_reward": -1.455253005027771, - "objective/rlhf_reward": -4.264752714839533, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 34.374176025390625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.51171875, - "step": 236, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989919662475586 - }, - { - "episode": 3808, - "epoch": 0.02281578411283268, - "loss/policy_avg": 0.43571943044662476, - "lr": 9.848542944785276e-06, - "objective/entropy": 144.94302368164062, - "objective/kl": 33.369178771972656, - "objective/non_score_reward": -1.6684589385986328, - "objective/rlhf_reward": -5.314585768912716, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 113.68771362304688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.607421875, - "step": 237, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.996192216873169 - }, - { - "episode": 3824, - "epoch": 0.02291164875196223, - "loss/policy_avg": 0.14893671870231628, - "lr": 9.847903885480573e-06, - "objective/entropy": 186.38681030273438, - "objective/kl": 41.077842712402344, - "objective/non_score_reward": -2.0538923740386963, - "objective/rlhf_reward": -6.611449215475636, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 168.3666229248047, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.755859375, - "step": 238, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984937906265259 - }, - { - "episode": 3840, - "epoch": 0.023007513391091777, - "loss/policy_avg": 0.07648584991693497, - "lr": 9.84726482617587e-06, - "objective/entropy": -37.23631286621094, - "objective/kl": 25.318248748779297, - "objective/non_score_reward": -1.2659125328063965, - "objective/rlhf_reward": -3.5073907067447454, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 50.266414642333984, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.48828125, - "step": 239, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9979995489120483 - }, - { - "episode": 3856, - "epoch": 0.023103378030221326, - "loss/policy_avg": -0.15926438570022583, - "lr": 9.846625766871167e-06, - "objective/entropy": 37.868736267089844, - "objective/kl": 27.493305206298828, - "objective/non_score_reward": -1.3746652603149414, - "objective/rlhf_reward": -4.173148546248598, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 6.63505220413208, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5390625, - "step": 240, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0081896781921387 - }, - { - "episode": 3872, - "epoch": 0.023199242669350875, - "loss/policy_avg": 0.14562831819057465, - "lr": 9.845986707566462e-06, - "objective/entropy": 15.188220977783203, - "objective/kl": 28.046958923339844, - "objective/non_score_reward": -1.4023480415344238, - "objective/rlhf_reward": -4.1587937875703425, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 43.238990783691406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.578125, - "step": 241, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.996706247329712 - }, - { - "episode": 3888, - "epoch": 0.023295107308480424, - "loss/policy_avg": 0.11054911464452744, - "lr": 9.845347648261759e-06, - "objective/entropy": 65.03858947753906, - "objective/kl": 30.087387084960938, - "objective/non_score_reward": -1.5043694972991943, - "objective/rlhf_reward": -4.070066402630742, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 3.83949613571167, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587890625, - "step": 242, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988956451416016 - }, - { - "episode": 3904, - "epoch": 0.023390971947609973, - "loss/policy_avg": 0.3941475749015808, - "lr": 9.844708588957056e-06, - "objective/entropy": 59.93316650390625, - "objective/kl": 25.623512268066406, - "objective/non_score_reward": -1.2811756134033203, - "objective/rlhf_reward": -3.52058264977129, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 78.30380249023438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5859375, - "step": 243, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990283250808716 - }, - { - "episode": 3920, - "epoch": 0.023486836586739522, - "loss/policy_avg": 0.19095474481582642, - "lr": 9.844069529652353e-06, - "objective/entropy": 31.422988891601562, - "objective/kl": 24.865825653076172, - "objective/non_score_reward": -1.2432913780212402, - "objective/rlhf_reward": -3.2398319403330484, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 38.12981033325195, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.53125, - "step": 244, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.004727840423584 - }, - { - "episode": 3936, - "epoch": 0.023582701225869074, - "loss/policy_avg": 0.049357250332832336, - "lr": 9.84343047034765e-06, - "objective/entropy": 21.297576904296875, - "objective/kl": 35.60150146484375, - "objective/non_score_reward": -1.7800750732421875, - "objective/rlhf_reward": -5.720300531387329, - "objective/scores": 0.35, - "policy/approxkl_avg": 38.869449615478516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4716796875, - "step": 245, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0019991397857666 - }, - { - "episode": 3952, - "epoch": 0.023678565864998623, - "loss/policy_avg": 0.7713517546653748, - "lr": 9.842791411042945e-06, - "objective/entropy": 53.62720489501953, - "objective/kl": 31.218942642211914, - "objective/non_score_reward": -1.5609471797943115, - "objective/rlhf_reward": -4.296377490239079, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 48.73869323730469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7265625, - "step": 246, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975836277008057 - }, - { - "episode": 3968, - "epoch": 0.023774430504128172, - "loss/policy_avg": 0.008143262937664986, - "lr": 9.842152351738242e-06, - "objective/entropy": 171.02789306640625, - "objective/kl": 34.79176330566406, - "objective/non_score_reward": -1.7395880222320557, - "objective/rlhf_reward": -5.296492939413176, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 21.7828369140625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.57421875, - "step": 247, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9989376068115234 - }, - { - "episode": 3984, - "epoch": 0.02387029514325772, - "loss/policy_avg": -0.12264247238636017, - "lr": 9.841513292433539e-06, - "objective/entropy": 80.24577331542969, - "objective/kl": 33.11949920654297, - "objective/non_score_reward": -1.6559748649597168, - "objective/rlhf_reward": -4.799071069034646, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 61.87395477294922, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4599609375, - "step": 248, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.003005027770996 - }, - { - "episode": 4000, - "epoch": 0.02396615978238727, - "loss/policy_avg": 0.2658330202102661, - "lr": 9.840874233128836e-06, - "objective/entropy": 149.58941650390625, - "objective/kl": 29.3863525390625, - "objective/non_score_reward": -1.4693175554275513, - "objective/rlhf_reward": -4.273150358263569, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 58.66055679321289, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.51171875, - "step": 249, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9972658157348633 - }, - { - "episode": 4016, - "epoch": 0.02406202442151682, - "loss/policy_avg": 0.09115779399871826, - "lr": 9.840235173824132e-06, - "objective/entropy": 147.28927612304688, - "objective/kl": 31.492679595947266, - "objective/non_score_reward": -1.5746338367462158, - "objective/rlhf_reward": -4.939285838340206, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 28.799278259277344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.796875, - "step": 250, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.002530574798584 - }, - { - "episode": 4032, - "epoch": 0.024157889060646368, - "loss/policy_avg": 0.09398385882377625, - "lr": 9.83959611451943e-06, - "objective/entropy": -45.248435974121094, - "objective/kl": 28.402175903320312, - "objective/non_score_reward": -1.4201087951660156, - "objective/rlhf_reward": -4.018575882137405, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 19.838550567626953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.517578125, - "step": 251, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9973764419555664 - }, - { - "episode": 4048, - "epoch": 0.024253753699775917, - "loss/policy_avg": 0.19270983338356018, - "lr": 9.838957055214724e-06, - "objective/entropy": 77.1705093383789, - "objective/kl": 34.050987243652344, - "objective/non_score_reward": -1.7025493383407593, - "objective/rlhf_reward": -5.076863960425058, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 18.725093841552734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4814453125, - "step": 252, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001063823699951 - }, - { - "episode": 4064, - "epoch": 0.024349618338905466, - "loss/policy_avg": 0.4652649164199829, - "lr": 9.838317995910021e-06, - "objective/entropy": 257.7345886230469, - "objective/kl": 24.133747100830078, - "objective/non_score_reward": -1.2066874504089355, - "objective/rlhf_reward": -3.4481475735581935, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 41.46368408203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.80078125, - "step": 253, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9955878257751465 - }, - { - "episode": 4080, - "epoch": 0.024445482978035015, - "loss/policy_avg": 0.14692571759223938, - "lr": 9.837678936605318e-06, - "objective/entropy": 43.00188064575195, - "objective/kl": 24.73518180847168, - "objective/non_score_reward": -1.236759066581726, - "objective/rlhf_reward": -3.568433978644711, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 75.05264282226562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 254, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990639686584473 - }, - { - "episode": 4096, - "epoch": 0.024541347617164564, - "loss/policy_avg": 0.08271847665309906, - "lr": 9.837039877300615e-06, - "objective/entropy": -79.57066345214844, - "objective/kl": 26.90784454345703, - "objective/non_score_reward": -1.3453922271728516, - "objective/rlhf_reward": -3.648235575358073, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 24.23294448852539, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.619140625, - "step": 255, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984873533248901 - }, - { - "episode": 4112, - "epoch": 0.024637212256294113, - "loss/policy_avg": 0.12403183430433273, - "lr": 9.83640081799591e-06, - "objective/entropy": 87.87326049804688, - "objective/kl": 29.708419799804688, - "objective/non_score_reward": -1.4854209423065186, - "objective/rlhf_reward": -4.116855438026499, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 32.65428161621094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.62890625, - "step": 256, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981478452682495 - }, - { - "episode": 4128, - "epoch": 0.024733076895423662, - "loss/policy_avg": -0.17764857411384583, - "lr": 9.835761758691207e-06, - "objective/entropy": 130.6345977783203, - "objective/kl": 34.35237121582031, - "objective/non_score_reward": -1.717618465423584, - "objective/rlhf_reward": -5.314214794841364, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 118.99533081054688, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.3974609375, - "step": 257, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.014057159423828 - }, - { - "episode": 4144, - "epoch": 0.02482894153455321, - "loss/policy_avg": 2.400163173675537, - "lr": 9.835122699386504e-06, - "objective/entropy": 123.72301483154297, - "objective/kl": 21.25601577758789, - "objective/non_score_reward": -1.0628007650375366, - "objective/rlhf_reward": 0.1487968802452091, - "objective/scores": 1.1, - "policy/approxkl_avg": 36.07887268066406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.572265625, - "step": 258, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998108148574829 - }, - { - "episode": 4160, - "epoch": 0.02492480617368276, - "loss/policy_avg": 0.3900964856147766, - "lr": 9.8344836400818e-06, - "objective/entropy": 233.3748321533203, - "objective/kl": 42.447425842285156, - "objective/non_score_reward": -2.1223714351654053, - "objective/rlhf_reward": -5.5657667263757915, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 19.722026824951172, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.74609375, - "step": 259, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000584363937378 - }, - { - "episode": 4176, - "epoch": 0.02502067081281231, - "loss/policy_avg": 0.3361247181892395, - "lr": 9.833844580777096e-06, - "objective/entropy": 135.13961791992188, - "objective/kl": 31.25783920288086, - "objective/non_score_reward": -1.5628920793533325, - "objective/rlhf_reward": -4.426739449771952, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 16.49414825439453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.486328125, - "step": 260, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9986810684204102 - }, - { - "episode": 4192, - "epoch": 0.025116535451941858, - "loss/policy_avg": 0.1438344419002533, - "lr": 9.833205521472393e-06, - "objective/entropy": 104.18168640136719, - "objective/kl": 35.72525405883789, - "objective/non_score_reward": -1.7862627506256104, - "objective/rlhf_reward": -5.320222015651773, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 22.100770950317383, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65625, - "step": 261, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996044635772705 - }, - { - "episode": 4208, - "epoch": 0.025212400091071407, - "loss/policy_avg": 2.402132034301758, - "lr": 9.83256646216769e-06, - "objective/entropy": 91.16908264160156, - "objective/kl": 29.633235931396484, - "objective/non_score_reward": -1.4816619157791138, - "objective/rlhf_reward": -4.476049522967681, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 43.586891174316406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.64453125, - "step": 262, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.004427433013916 - }, - { - "episode": 4224, - "epoch": 0.025308264730200956, - "loss/policy_avg": 0.7259080410003662, - "lr": 9.831927402862987e-06, - "objective/entropy": 154.68115234375, - "objective/kl": 37.00696563720703, - "objective/non_score_reward": -1.8503483533859253, - "objective/rlhf_reward": -5.576564307483743, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 16.052043914794922, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.484375, - "step": 263, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9974215030670166 - }, - { - "episode": 4240, - "epoch": 0.025404129369330505, - "loss/policy_avg": 0.09373458474874496, - "lr": 9.831288343558284e-06, - "objective/entropy": 72.85606384277344, - "objective/kl": 27.522302627563477, - "objective/non_score_reward": -1.376115083694458, - "objective/rlhf_reward": -3.679631943973612, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 142.1138916015625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.310546875, - "step": 264, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991629123687744 - }, - { - "episode": 4256, - "epoch": 0.025499994008460054, - "loss/policy_avg": 0.7555310130119324, - "lr": 9.830649284253579e-06, - "objective/entropy": 72.61222076416016, - "objective/kl": 30.647029876708984, - "objective/non_score_reward": -1.5323514938354492, - "objective/rlhf_reward": -4.705573756893244, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 54.394874572753906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.587890625, - "step": 265, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0006964206695557 - }, - { - "episode": 4272, - "epoch": 0.025595858647589603, - "loss/policy_avg": 0.6551899313926697, - "lr": 9.830010224948876e-06, - "objective/entropy": 121.19924926757812, - "objective/kl": 33.96527099609375, - "objective/non_score_reward": -1.6982636451721191, - "objective/rlhf_reward": -5.131195192754852, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 40.39656066894531, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.470703125, - "step": 266, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999192476272583 - }, - { - "episode": 4288, - "epoch": 0.02569172328671915, - "loss/policy_avg": 1.1016074419021606, - "lr": 9.829371165644173e-06, - "objective/entropy": 132.00601196289062, - "objective/kl": 43.09049987792969, - "objective/non_score_reward": -2.154524803161621, - "objective/rlhf_reward": -7.102327191623386, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 126.27546691894531, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3935546875, - "step": 267, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9990718364715576 - }, - { - "episode": 4304, - "epoch": 0.0257875879258487, - "loss/policy_avg": 0.08981708437204361, - "lr": 9.82873210633947e-06, - "objective/entropy": 140.80239868164062, - "objective/kl": 26.626178741455078, - "objective/non_score_reward": -1.3313090801239014, - "objective/rlhf_reward": -0.9252360224723812, - "objective/scores": 1.1, - "policy/approxkl_avg": 84.53665924072266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.841796875, - "step": 268, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997150182723999 - }, - { - "episode": 4320, - "epoch": 0.02588345256497825, - "loss/policy_avg": 0.565528929233551, - "lr": 9.828093047034766e-06, - "objective/entropy": 138.6593017578125, - "objective/kl": 32.08763885498047, - "objective/non_score_reward": -1.604382038116455, - "objective/rlhf_reward": -4.813408408228474, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 34.42543029785156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.427734375, - "step": 269, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0018558502197266 - }, - { - "episode": 4336, - "epoch": 0.0259793172041078, - "loss/policy_avg": 0.4312899708747864, - "lr": 9.827453987730061e-06, - "objective/entropy": 20.17654800415039, - "objective/kl": 23.528181076049805, - "objective/non_score_reward": -1.176409125328064, - "objective/rlhf_reward": -2.5829304478326183, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 20.440711975097656, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7890625, - "step": 270, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9989676475524902 - }, - { - "episode": 4352, - "epoch": 0.026075181843237347, - "loss/policy_avg": 0.20729105174541473, - "lr": 9.826814928425358e-06, - "objective/entropy": 166.21115112304688, - "objective/kl": 31.01326560974121, - "objective/non_score_reward": -1.5506633520126343, - "objective/rlhf_reward": -6.202653288841248, - "objective/scores": 0.0, - "policy/approxkl_avg": 34.41830825805664, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.654296875, - "step": 271, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.003887176513672 - }, - { - "episode": 4368, - "epoch": 0.026171046482366896, - "loss/policy_avg": 3.2944061756134033, - "lr": 9.826175869120655e-06, - "objective/entropy": 28.755096435546875, - "objective/kl": 31.482175827026367, - "objective/non_score_reward": -1.5741088390350342, - "objective/rlhf_reward": -4.917833187667233, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 5.366632461547852, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3701171875, - "step": 272, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0053317546844482 - }, - { - "episode": 4384, - "epoch": 0.02626691112149645, - "loss/policy_avg": 0.23004142940044403, - "lr": 9.825536809815952e-06, - "objective/entropy": 54.82402038574219, - "objective/kl": 32.45307922363281, - "objective/non_score_reward": -1.6226541996002197, - "objective/rlhf_reward": -5.148980966120391, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 31.775432586669922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.525390625, - "step": 273, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995795488357544 - }, - { - "episode": 4400, - "epoch": 0.026362775760625998, - "loss/policy_avg": -0.08435960114002228, - "lr": 9.824897750511249e-06, - "objective/entropy": 98.25897216796875, - "objective/kl": 28.68474578857422, - "objective/non_score_reward": -1.4342372417449951, - "objective/rlhf_reward": -5.73694920539856, - "objective/scores": 0.0, - "policy/approxkl_avg": 72.97157287597656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.517578125, - "step": 274, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0012598037719727 - }, - { - "episode": 4416, - "epoch": 0.026458640399755547, - "loss/policy_avg": 0.41626134514808655, - "lr": 9.824258691206546e-06, - "objective/entropy": 83.60694885253906, - "objective/kl": 30.977035522460938, - "objective/non_score_reward": -1.548851728439331, - "objective/rlhf_reward": -4.795407152175903, - "objective/scores": 0.35, - "policy/approxkl_avg": 39.04691696166992, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.60546875, - "step": 275, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0021119117736816 - }, - { - "episode": 4432, - "epoch": 0.026554505038885096, - "loss/policy_avg": 0.43957769870758057, - "lr": 9.823619631901841e-06, - "objective/entropy": 127.34529113769531, - "objective/kl": 35.28544616699219, - "objective/non_score_reward": -1.7642724514007568, - "objective/rlhf_reward": -5.606491903872833, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 150.78646850585938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.748046875, - "step": 276, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9970619678497314 - }, - { - "episode": 4448, - "epoch": 0.026650369678014645, - "loss/policy_avg": 0.8086847066879272, - "lr": 9.822980572597138e-06, - "objective/entropy": -119.74644470214844, - "objective/kl": 26.706302642822266, - "objective/non_score_reward": -1.335315227508545, - "objective/rlhf_reward": -3.9412606716156002, - "objective/scores": 0.35, - "policy/approxkl_avg": 65.78569793701172, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.568359375, - "step": 277, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986886978149414 - }, - { - "episode": 4464, - "epoch": 0.026746234317144194, - "loss/policy_avg": 0.09760895371437073, - "lr": 9.822341513292433e-06, - "objective/entropy": 209.31890869140625, - "objective/kl": 41.666831970214844, - "objective/non_score_reward": -2.083341598510742, - "objective/rlhf_reward": -6.7292466498056225, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 14.525606155395508, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 278, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980053901672363 - }, - { - "episode": 4480, - "epoch": 0.026842098956273742, - "loss/policy_avg": 0.0820450559258461, - "lr": 9.82170245398773e-06, - "objective/entropy": 152.01095581054688, - "objective/kl": 29.104724884033203, - "objective/non_score_reward": -1.4552361965179443, - "objective/rlhf_reward": -4.159085219324218, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 21.12679100036621, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4560546875, - "step": 279, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.99649977684021 - }, - { - "episode": 4496, - "epoch": 0.02693796359540329, - "loss/policy_avg": 0.08112587034702301, - "lr": 9.821063394683027e-06, - "objective/entropy": 49.22539138793945, - "objective/kl": 32.40191650390625, - "objective/non_score_reward": -1.6200958490371704, - "objective/rlhf_reward": -5.029785375209197, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 4.874902725219727, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.404296875, - "step": 280, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0027780532836914 - }, - { - "episode": 4512, - "epoch": 0.02703382823453284, - "loss/policy_avg": 0.41851094365119934, - "lr": 9.820424335378324e-06, - "objective/entropy": 108.13827514648438, - "objective/kl": 44.792015075683594, - "objective/non_score_reward": -2.239600658416748, - "objective/rlhf_reward": -7.133574362072061, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 67.72032165527344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.537109375, - "step": 281, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9979641437530518 - }, - { - "episode": 4528, - "epoch": 0.02712969287366239, - "loss/policy_avg": 0.8327301144599915, - "lr": 9.81978527607362e-06, - "objective/entropy": 70.98486328125, - "objective/kl": 43.82145690917969, - "objective/non_score_reward": -2.191072702407837, - "objective/rlhf_reward": -7.283338430340647, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.1268585920333862, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.427734375, - "step": 282, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.002413749694824 - }, - { - "episode": 4544, - "epoch": 0.02722555751279194, - "loss/policy_avg": 0.26003268361091614, - "lr": 9.819146216768916e-06, - "objective/entropy": 59.813140869140625, - "objective/kl": 32.33997344970703, - "objective/non_score_reward": -1.6169987916946411, - "objective/rlhf_reward": -4.643166418346476, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 108.00172424316406, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.60546875, - "step": 283, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.996032953262329 - }, - { - "episode": 4560, - "epoch": 0.027321422151921487, - "loss/policy_avg": 0.06828334182500839, - "lr": 9.818507157464213e-06, - "objective/entropy": 164.7733154296875, - "objective/kl": 36.976539611816406, - "objective/non_score_reward": -1.8488272428512573, - "objective/rlhf_reward": -5.791188750330525, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 22.712989807128906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.498046875, - "step": 284, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.998927354812622 - }, - { - "episode": 4576, - "epoch": 0.027417286791051036, - "loss/policy_avg": 0.346102774143219, - "lr": 9.81786809815951e-06, - "objective/entropy": 141.91213989257812, - "objective/kl": 29.89690589904785, - "objective/non_score_reward": -1.4948452711105347, - "objective/rlhf_reward": -4.5793810248374935, - "objective/scores": 0.35, - "policy/approxkl_avg": 4.914261817932129, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.583984375, - "step": 285, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991732835769653 - }, - { - "episode": 4592, - "epoch": 0.027513151430180585, - "loss/policy_avg": 0.07111110538244247, - "lr": 9.817229038854806e-06, - "objective/entropy": -41.44879150390625, - "objective/kl": 29.296417236328125, - "objective/non_score_reward": -1.4648208618164062, - "objective/rlhf_reward": -4.4806815172113, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 70.16557312011719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5625, - "step": 286, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982048273086548 - }, - { - "episode": 4608, - "epoch": 0.027609016069310134, - "loss/policy_avg": 0.6204440593719482, - "lr": 9.816589979550103e-06, - "objective/entropy": 10.609687805175781, - "objective/kl": 34.5562744140625, - "objective/non_score_reward": -1.727813720703125, - "objective/rlhf_reward": -5.552005314563198, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 44.11948776245117, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4677734375, - "step": 287, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9960062503814697 - }, - { - "episode": 4624, - "epoch": 0.027704880708439683, - "loss/policy_avg": -0.3703474700450897, - "lr": 9.8159509202454e-06, - "objective/entropy": 16.20748519897461, - "objective/kl": 40.348899841308594, - "objective/non_score_reward": -2.0174450874328613, - "objective/rlhf_reward": -5.6697804689407345, - "objective/scores": 0.6, - "policy/approxkl_avg": 58.94084167480469, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.451171875, - "step": 288, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000488519668579 - }, - { - "episode": 4640, - "epoch": 0.027800745347569232, - "loss/policy_avg": 0.691341757774353, - "lr": 9.815311860940695e-06, - "objective/entropy": 164.64894104003906, - "objective/kl": 35.96034240722656, - "objective/non_score_reward": -1.7980170249938965, - "objective/rlhf_reward": -2.792067980766296, - "objective/scores": 1.1, - "policy/approxkl_avg": 105.621826171875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.53125, - "step": 289, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9972904920578003 - }, - { - "episode": 4656, - "epoch": 0.02789660998669878, - "loss/policy_avg": 0.05122673511505127, - "lr": 9.814672801635992e-06, - "objective/entropy": 143.17758178710938, - "objective/kl": 27.651023864746094, - "objective/non_score_reward": -1.3825511932373047, - "objective/rlhf_reward": -2.6064857586633887, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 30.806257247924805, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.462890625, - "step": 290, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.996225118637085 - }, - { - "episode": 4672, - "epoch": 0.02799247462582833, - "loss/policy_avg": -0.021466929465532303, - "lr": 9.81403374233129e-06, - "objective/entropy": 123.44010925292969, - "objective/kl": 18.645748138427734, - "objective/non_score_reward": -0.9322873950004578, - "objective/rlhf_reward": -2.403636608153505, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 24.915597915649414, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.638671875, - "step": 291, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.995213270187378 - }, - { - "episode": 4688, - "epoch": 0.02808833926495788, - "loss/policy_avg": 0.700859785079956, - "lr": 9.813394683026586e-06, - "objective/entropy": 58.48292922973633, - "objective/kl": 28.2305965423584, - "objective/non_score_reward": -1.411529779434204, - "objective/rlhf_reward": -4.24611941576004, - "objective/scores": 0.35, - "policy/approxkl_avg": 21.04977035522461, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4208984375, - "step": 292, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9981340169906616 - }, - { - "episode": 4704, - "epoch": 0.028184203904087428, - "loss/policy_avg": 0.9605820775032043, - "lr": 9.812755623721883e-06, - "objective/entropy": -33.6519775390625, - "objective/kl": 33.635501861572266, - "objective/non_score_reward": -1.6817750930786133, - "objective/rlhf_reward": -5.065241103590118, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 18.019363403320312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4619140625, - "step": 293, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000504493713379 - }, - { - "episode": 4720, - "epoch": 0.028280068543216977, - "loss/policy_avg": 0.44443511962890625, - "lr": 9.81211656441718e-06, - "objective/entropy": 61.81305694580078, - "objective/kl": 37.54548263549805, - "objective/non_score_reward": -1.8772742748260498, - "objective/rlhf_reward": -6.130494453994137, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 34.736690521240234, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.701171875, - "step": 294, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981006383895874 - }, - { - "episode": 4736, - "epoch": 0.028375933182346526, - "loss/policy_avg": -0.004817202687263489, - "lr": 9.811477505112475e-06, - "objective/entropy": -85.25079345703125, - "objective/kl": 22.125272750854492, - "objective/non_score_reward": -1.1062637567520142, - "objective/rlhf_reward": -3.0658050415262412, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 39.945377349853516, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.619140625, - "step": 295, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001112937927246 - }, - { - "episode": 4752, - "epoch": 0.028471797821476075, - "loss/policy_avg": -0.018911486491560936, - "lr": 9.810838445807772e-06, - "objective/entropy": 187.50953674316406, - "objective/kl": 31.752737045288086, - "objective/non_score_reward": -1.587636947631836, - "objective/rlhf_reward": -4.525718684467386, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 41.095298767089844, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.560546875, - "step": 296, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0318245887756348 - }, - { - "episode": 4768, - "epoch": 0.028567662460605624, - "loss/policy_avg": 0.5813855528831482, - "lr": 9.810199386503069e-06, - "objective/entropy": 13.395767211914062, - "objective/kl": 29.76428985595703, - "objective/non_score_reward": -1.4882144927978516, - "objective/rlhf_reward": -4.219524757067362, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 58.40808868408203, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62109375, - "step": 297, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9971027374267578 - }, - { - "episode": 4784, - "epoch": 0.028663527099735173, - "loss/policy_avg": 0.25174012780189514, - "lr": 9.809560327198366e-06, - "objective/entropy": 93.99857330322266, - "objective/kl": 31.07823944091797, - "objective/non_score_reward": -1.5539120435714722, - "objective/rlhf_reward": -4.482314721743266, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 56.219329833984375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.767578125, - "step": 298, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9973026514053345 - }, - { - "episode": 4800, - "epoch": 0.02875939173886472, - "loss/policy_avg": -0.05966740474104881, - "lr": 9.808921267893663e-06, - "objective/entropy": 199.3701934814453, - "objective/kl": 26.15532684326172, - "objective/non_score_reward": -1.3077664375305176, - "objective/rlhf_reward": -3.7152936098896827, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 18.272422790527344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.650390625, - "step": 299, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.002091646194458 - }, - { - "episode": 4816, - "epoch": 0.02885525637799427, - "loss/policy_avg": 0.19725301861763, - "lr": 9.808282208588958e-06, - "objective/entropy": 112.11613464355469, - "objective/kl": 33.344722747802734, - "objective/non_score_reward": -1.667236089706421, - "objective/rlhf_reward": -6.668944478034973, - "objective/scores": 0.0, - "policy/approxkl_avg": 29.54242706298828, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.640625, - "step": 300, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0038766860961914 - }, - { - "episode": 4832, - "epoch": 0.02895112101712382, - "loss/policy_avg": -0.17506346106529236, - "lr": 9.807643149284255e-06, - "objective/entropy": 70.48281860351562, - "objective/kl": 29.51511573791504, - "objective/non_score_reward": -1.4757558107376099, - "objective/rlhf_reward": -4.387251400741276, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 12.791141510009766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4814453125, - "step": 301, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999895691871643 - }, - { - "episode": 4848, - "epoch": 0.029046985656253372, - "loss/policy_avg": 0.38140204548835754, - "lr": 9.80700408997955e-06, - "objective/entropy": 23.643152236938477, - "objective/kl": 27.579925537109375, - "objective/non_score_reward": -1.3789963722229004, - "objective/rlhf_reward": -3.854125951946364, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 8.89024543762207, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 302, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984774589538574 - }, - { - "episode": 4864, - "epoch": 0.02914285029538292, - "loss/policy_avg": 0.18466374278068542, - "lr": 9.806365030674847e-06, - "objective/entropy": -30.63671875, - "objective/kl": 25.678733825683594, - "objective/non_score_reward": -1.2839367389678955, - "objective/rlhf_reward": -3.6199750540577735, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 4.08036470413208, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53515625, - "step": 303, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999252200126648 - }, - { - "episode": 4880, - "epoch": 0.02923871493451247, - "loss/policy_avg": 0.20352232456207275, - "lr": 9.805725971370144e-06, - "objective/entropy": -14.465229034423828, - "objective/kl": 16.88151741027832, - "objective/non_score_reward": -0.8440757989883423, - "objective/rlhf_reward": 1.023696751892567, - "objective/scores": 1.1, - "policy/approxkl_avg": 16.945369720458984, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.51171875, - "step": 304, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997638463973999 - }, - { - "episode": 4896, - "epoch": 0.02933457957364202, - "loss/policy_avg": 0.36892420053482056, - "lr": 9.80508691206544e-06, - "objective/entropy": 136.53363037109375, - "objective/kl": 30.262548446655273, - "objective/non_score_reward": -1.513127326965332, - "objective/rlhf_reward": -3.1287905319940776, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 30.166175842285156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.615234375, - "step": 305, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0001025199890137 - }, - { - "episode": 4912, - "epoch": 0.029430444212771568, - "loss/policy_avg": 0.07577557861804962, - "lr": 9.804447852760737e-06, - "objective/entropy": 77.17935943603516, - "objective/kl": 28.32352638244629, - "objective/non_score_reward": -1.4161763191223145, - "objective/rlhf_reward": -4.148933493884739, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.6957955360412598, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.63671875, - "step": 306, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0025148391723633 - }, - { - "episode": 4928, - "epoch": 0.029526308851901117, - "loss/policy_avg": 0.1559610664844513, - "lr": 9.803808793456034e-06, - "objective/entropy": -16.938400268554688, - "objective/kl": 21.827743530273438, - "objective/non_score_reward": -1.091387152671814, - "objective/rlhf_reward": -2.703689043939696, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 7.885660171508789, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.431640625, - "step": 307, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0027849674224854 - }, - { - "episode": 4944, - "epoch": 0.029622173491030666, - "loss/policy_avg": -0.17305535078048706, - "lr": 9.80316973415133e-06, - "objective/entropy": -31.412694931030273, - "objective/kl": 23.805431365966797, - "objective/non_score_reward": -1.1902716159820557, - "objective/rlhf_reward": -3.1569663322606853, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 35.29633331298828, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.654296875, - "step": 308, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0007619857788086 - }, - { - "episode": 4960, - "epoch": 0.029718038130160215, - "loss/policy_avg": 0.13406828045845032, - "lr": 9.802530674846626e-06, - "objective/entropy": 68.0604248046875, - "objective/kl": 31.641517639160156, - "objective/non_score_reward": -1.582075834274292, - "objective/rlhf_reward": -4.949701407042843, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 32.652069091796875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3017578125, - "step": 309, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9981597661972046 - }, - { - "episode": 4976, - "epoch": 0.029813902769289764, - "loss/policy_avg": 0.3640270233154297, - "lr": 9.801891615541923e-06, - "objective/entropy": 73.73117065429688, - "objective/kl": 22.181957244873047, - "objective/non_score_reward": -1.109097957611084, - "objective/rlhf_reward": -4.436391651630402, - "objective/scores": 0.0, - "policy/approxkl_avg": 24.474929809570312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.77734375, - "step": 310, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988360404968262 - }, - { - "episode": 4992, - "epoch": 0.029909767408419313, - "loss/policy_avg": 0.598778486251831, - "lr": 9.80125255623722e-06, - "objective/entropy": 77.45819854736328, - "objective/kl": 31.91500473022461, - "objective/non_score_reward": -1.5957502126693726, - "objective/rlhf_reward": -4.558172132047723, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 7.392116546630859, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.595703125, - "step": 311, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9976756572723389 - }, - { - "episode": 5008, - "epoch": 0.03000563204754886, - "loss/policy_avg": -0.14829277992248535, - "lr": 9.800613496932517e-06, - "objective/entropy": 73.91107940673828, - "objective/kl": 22.043235778808594, - "objective/non_score_reward": -1.1021617650985718, - "objective/rlhf_reward": -3.0086471796035763, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.375496864318848, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.548828125, - "step": 312, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0021305084228516 - }, - { - "episode": 5024, - "epoch": 0.03010149668667841, - "loss/policy_avg": 0.34449532628059387, - "lr": 9.799974437627812e-06, - "objective/entropy": 27.04425048828125, - "objective/kl": 31.98007583618164, - "objective/non_score_reward": -1.599003791809082, - "objective/rlhf_reward": -4.996015524864196, - "objective/scores": 0.35, - "policy/approxkl_avg": 53.630210876464844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.529296875, - "step": 313, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990134239196777 - }, - { - "episode": 5040, - "epoch": 0.03019736132580796, - "loss/policy_avg": 0.029857225716114044, - "lr": 9.799335378323109e-06, - "objective/entropy": 147.96096801757812, - "objective/kl": 27.342838287353516, - "objective/non_score_reward": -1.3671419620513916, - "objective/rlhf_reward": -4.017969946475372, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 18.108400344848633, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.638671875, - "step": 314, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988747835159302 - }, - { - "episode": 5056, - "epoch": 0.03029322596493751, - "loss/policy_avg": 0.05283927917480469, - "lr": 9.798696319018406e-06, - "objective/entropy": -46.846099853515625, - "objective/kl": 30.715242385864258, - "objective/non_score_reward": -1.535762071609497, - "objective/rlhf_reward": -4.538928542200642, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 66.26033020019531, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6484375, - "step": 315, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992979764938354 - }, - { - "episode": 5072, - "epoch": 0.030389090604067057, - "loss/policy_avg": 0.2858242094516754, - "lr": 9.798057259713703e-06, - "objective/entropy": -156.9435577392578, - "objective/kl": 31.284622192382812, - "objective/non_score_reward": -1.5642311573028564, - "objective/rlhf_reward": -4.915288856535583, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 74.38943481445312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7421875, - "step": 316, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9992833137512207 - }, - { - "episode": 5088, - "epoch": 0.030484955243196606, - "loss/policy_avg": 0.28274843096733093, - "lr": 9.797418200409e-06, - "objective/entropy": -214.69573974609375, - "objective/kl": 22.27606201171875, - "objective/non_score_reward": -1.1138031482696533, - "objective/rlhf_reward": -2.3325063607850414, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 35.48945236206055, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.505859375, - "step": 317, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9962687492370605 - }, - { - "episode": 5104, - "epoch": 0.030580819882326155, - "loss/policy_avg": -0.08736838400363922, - "lr": 9.796779141104296e-06, - "objective/entropy": -18.148971557617188, - "objective/kl": 27.546077728271484, - "objective/non_score_reward": -1.377303957939148, - "objective/rlhf_reward": -4.1306134844697535, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 76.84832000732422, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6171875, - "step": 318, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0014290809631348 - }, - { - "episode": 5120, - "epoch": 0.030676684521455704, - "loss/policy_avg": 0.031098078936338425, - "lr": 9.796140081799592e-06, - "objective/entropy": 103.30211639404297, - "objective/kl": 27.747032165527344, - "objective/non_score_reward": -1.3873515129089355, - "objective/rlhf_reward": -4.033634447845158, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 54.69970703125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 319, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9985463619232178 - }, - { - "episode": 5136, - "epoch": 0.030772549160585253, - "loss/policy_avg": 0.3622899651527405, - "lr": 9.795501022494888e-06, - "objective/entropy": 66.0567398071289, - "objective/kl": 26.39444351196289, - "objective/non_score_reward": -1.3197221755981445, - "objective/rlhf_reward": -3.7226295759349615, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 5.640605449676514, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 320, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9992847442626953 - }, - { - "episode": 5152, - "epoch": 0.030868413799714802, - "loss/policy_avg": -0.10469883680343628, - "lr": 9.794861963190185e-06, - "objective/entropy": 35.81920623779297, - "objective/kl": 25.668739318847656, - "objective/non_score_reward": -1.2834370136260986, - "objective/rlhf_reward": -3.7337480843067166, - "objective/scores": 0.35, - "policy/approxkl_avg": 5.808808326721191, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6484375, - "step": 321, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999007225036621 - }, - { - "episode": 5168, - "epoch": 0.03096427843884435, - "loss/policy_avg": -0.2741212248802185, - "lr": 9.794222903885482e-06, - "objective/entropy": 52.38888168334961, - "objective/kl": 34.969974517822266, - "objective/non_score_reward": -1.748498797416687, - "objective/rlhf_reward": -5.652359655409484, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 8.913843154907227, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3798828125, - "step": 322, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0103840827941895 - }, - { - "episode": 5184, - "epoch": 0.0310601430779739, - "loss/policy_avg": 0.30122414231300354, - "lr": 9.793583844580777e-06, - "objective/entropy": 134.16075134277344, - "objective/kl": 25.608116149902344, - "objective/non_score_reward": -1.280405879020691, - "objective/rlhf_reward": -3.1742123318480804, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 74.33633422851562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.603515625, - "step": 323, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.014462471008301 - }, - { - "episode": 5200, - "epoch": 0.03115600771710345, - "loss/policy_avg": 0.26204991340637207, - "lr": 9.792944785276074e-06, - "objective/entropy": 2.559833526611328, - "objective/kl": 25.519519805908203, - "objective/non_score_reward": -1.2759759426116943, - "objective/rlhf_reward": -3.74465426180212, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 92.09954071044922, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.57421875, - "step": 324, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998145580291748 - }, - { - "episode": 5216, - "epoch": 0.031251872356233, - "loss/policy_avg": 0.18864840269088745, - "lr": 9.792305725971371e-06, - "objective/entropy": 48.99184036254883, - "objective/kl": 28.022377014160156, - "objective/non_score_reward": -1.4011187553405762, - "objective/rlhf_reward": -4.123522403653025, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 22.120746612548828, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.60546875, - "step": 325, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984712600708008 - }, - { - "episode": 5232, - "epoch": 0.03134773699536255, - "loss/policy_avg": 0.42162489891052246, - "lr": 9.791666666666666e-06, - "objective/entropy": -129.23065185546875, - "objective/kl": 31.687660217285156, - "objective/non_score_reward": -1.5843830108642578, - "objective/rlhf_reward": -4.821760052236256, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 111.98194885253906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587890625, - "step": 326, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.996734857559204 - }, - { - "episode": 5248, - "epoch": 0.0314436016344921, - "loss/policy_avg": -0.07900102436542511, - "lr": 9.791027607361963e-06, - "objective/entropy": 31.351696014404297, - "objective/kl": 27.038206100463867, - "objective/non_score_reward": -1.3519103527069092, - "objective/rlhf_reward": -5.407641291618347, - "objective/scores": 0.0, - "policy/approxkl_avg": 9.7061767578125, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4423828125, - "step": 327, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0043118000030518 - }, - { - "episode": 5264, - "epoch": 0.03153946627362165, - "loss/policy_avg": 0.16587843000888824, - "lr": 9.79038854805726e-06, - "objective/entropy": 143.86651611328125, - "objective/kl": 27.42593765258789, - "objective/non_score_reward": -1.3712968826293945, - "objective/rlhf_reward": -4.125937962268276, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 119.49800872802734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.767578125, - "step": 328, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999497890472412 - }, - { - "episode": 5280, - "epoch": 0.0316353309127512, - "loss/policy_avg": 0.29106539487838745, - "lr": 9.789749488752557e-06, - "objective/entropy": 67.8651351928711, - "objective/kl": 32.114479064941406, - "objective/non_score_reward": -1.6057239770889282, - "objective/rlhf_reward": -5.08126013567987, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 7.976801872253418, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.470703125, - "step": 329, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0011985301971436 - }, - { - "episode": 5296, - "epoch": 0.031731195551880746, - "loss/policy_avg": 0.5780457258224487, - "lr": 9.789110429447854e-06, - "objective/entropy": 104.15371704101562, - "objective/kl": 30.92220687866211, - "objective/non_score_reward": -1.5461102724075317, - "objective/rlhf_reward": -3.784441030025482, - "objective/scores": 0.6, - "policy/approxkl_avg": 52.566375732421875, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.55859375, - "step": 330, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9993044137954712 - }, - { - "episode": 5312, - "epoch": 0.031827060191010295, - "loss/policy_avg": 0.24728742241859436, - "lr": 9.78847137014315e-06, - "objective/entropy": -95.75634765625, - "objective/kl": 30.755779266357422, - "objective/non_score_reward": -1.5377888679504395, - "objective/rlhf_reward": -4.791905486319942, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 32.567970275878906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.666015625, - "step": 331, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991464614868164 - }, - { - "episode": 5328, - "epoch": 0.031922924830139844, - "loss/policy_avg": 1.9531396627426147, - "lr": 9.787832310838446e-06, - "objective/entropy": 18.057151794433594, - "objective/kl": 21.966590881347656, - "objective/non_score_reward": -1.0983295440673828, - "objective/rlhf_reward": -2.993318116664886, - "objective/scores": 0.35, - "policy/approxkl_avg": 11.555295944213867, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.716796875, - "step": 332, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.004971981048584 - }, - { - "episode": 5344, - "epoch": 0.03201878946926939, - "loss/policy_avg": 0.0304682869464159, - "lr": 9.787193251533743e-06, - "objective/entropy": -100.86114501953125, - "objective/kl": 21.19540023803711, - "objective/non_score_reward": -1.0597699880599976, - "objective/rlhf_reward": -2.8604777837670863, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 36.17786407470703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46875, - "step": 333, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997187852859497 - }, - { - "episode": 5360, - "epoch": 0.03211465410839894, - "loss/policy_avg": 0.2974792718887329, - "lr": 9.78655419222904e-06, - "objective/entropy": 59.0064697265625, - "objective/kl": 23.83527183532715, - "objective/non_score_reward": -1.1917636394500732, - "objective/rlhf_reward": -3.2861017016724343, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 27.08124542236328, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.76171875, - "step": 334, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0005297660827637 - }, - { - "episode": 5376, - "epoch": 0.03221051874752849, - "loss/policy_avg": 0.20310130715370178, - "lr": 9.785915132924337e-06, - "objective/entropy": 51.579200744628906, - "objective/kl": 26.064043045043945, - "objective/non_score_reward": -1.3032021522521973, - "objective/rlhf_reward": -3.656549363341883, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 3.1224026679992676, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.51171875, - "step": 335, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0020689964294434 - }, - { - "episode": 5392, - "epoch": 0.03230638338665804, - "loss/policy_avg": -0.22360196709632874, - "lr": 9.785276073619633e-06, - "objective/entropy": 8.019195556640625, - "objective/kl": 34.267356872558594, - "objective/non_score_reward": -1.7133680582046509, - "objective/rlhf_reward": -5.40287409266983, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 27.402694702148438, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.517578125, - "step": 336, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.99736487865448 - }, - { - "episode": 5408, - "epoch": 0.03240224802578759, - "loss/policy_avg": 0.394004225730896, - "lr": 9.784637014314929e-06, - "objective/entropy": -7.316375732421875, - "objective/kl": 34.60337829589844, - "objective/non_score_reward": -1.7301688194274902, - "objective/rlhf_reward": -3.9969565018427105, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 60.58606719970703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.609375, - "step": 337, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990360736846924 - }, - { - "episode": 5424, - "epoch": 0.03249811266491714, - "loss/policy_avg": 0.08118537068367004, - "lr": 9.783997955010226e-06, - "objective/entropy": 3.808826446533203, - "objective/kl": 33.9757080078125, - "objective/non_score_reward": -1.6987853050231934, - "objective/rlhf_reward": -5.3713093592720895, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 49.47349548339844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.546875, - "step": 338, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974095821380615 - }, - { - "episode": 5440, - "epoch": 0.03259397730404669, - "loss/policy_avg": 0.1250596046447754, - "lr": 9.783358895705522e-06, - "objective/entropy": -42.7471809387207, - "objective/kl": 27.222618103027344, - "objective/non_score_reward": -1.361130952835083, - "objective/rlhf_reward": -3.9287524459683265, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 8.669515609741211, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.509765625, - "step": 339, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0002639293670654 - }, - { - "episode": 5456, - "epoch": 0.032689841943176236, - "loss/policy_avg": 1.2977867126464844, - "lr": 9.78271983640082e-06, - "objective/entropy": -60.51675796508789, - "objective/kl": 27.726932525634766, - "objective/non_score_reward": -1.3863465785980225, - "objective/rlhf_reward": -4.064433994706034, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 52.59510803222656, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4345703125, - "step": 340, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984922409057617 - }, - { - "episode": 5472, - "epoch": 0.032785706582305785, - "loss/policy_avg": 0.10771232098340988, - "lr": 9.782080777096116e-06, - "objective/entropy": 39.22501754760742, - "objective/kl": 38.581573486328125, - "objective/non_score_reward": -1.9290788173675537, - "objective/rlhf_reward": -6.374679616003662, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 16.336502075195312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.599609375, - "step": 341, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990124702453613 - }, - { - "episode": 5488, - "epoch": 0.032881571221435334, - "loss/policy_avg": 0.029969744384288788, - "lr": 9.781441717791413e-06, - "objective/entropy": 54.763675689697266, - "objective/kl": 27.586057662963867, - "objective/non_score_reward": -1.379302978515625, - "objective/rlhf_reward": -3.7838785807291666, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 29.997591018676758, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4462890625, - "step": 342, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992620944976807 - }, - { - "episode": 5504, - "epoch": 0.03297743586056488, - "loss/policy_avg": -0.003006638027727604, - "lr": 9.780802658486708e-06, - "objective/entropy": 4.6327056884765625, - "objective/kl": 25.01122283935547, - "objective/non_score_reward": -1.250560998916626, - "objective/rlhf_reward": -3.054833005146916, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.332850694656372, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.51171875, - "step": 343, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0042150020599365 - }, - { - "episode": 5520, - "epoch": 0.03307330049969443, - "loss/policy_avg": -0.2595655918121338, - "lr": 9.780163599182005e-06, - "objective/entropy": -9.382579803466797, - "objective/kl": 25.310394287109375, - "objective/non_score_reward": -1.2655197381973267, - "objective/rlhf_reward": -3.611480812640533, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 35.86376190185547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.708984375, - "step": 344, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991991519927979 - }, - { - "episode": 5536, - "epoch": 0.03316916513882398, - "loss/policy_avg": 1.6723182201385498, - "lr": 9.7795245398773e-06, - "objective/entropy": 167.249267578125, - "objective/kl": 38.30883026123047, - "objective/non_score_reward": -1.915441632270813, - "objective/rlhf_reward": -6.283164360610348, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 27.648231506347656, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.521484375, - "step": 345, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9966659545898438 - }, - { - "episode": 5552, - "epoch": 0.03326502977795353, - "loss/policy_avg": 0.21136921644210815, - "lr": 9.778885480572597e-06, - "objective/entropy": 202.48263549804688, - "objective/kl": 28.62633514404297, - "objective/non_score_reward": -1.4313167333602905, - "objective/rlhf_reward": -4.169007628169611, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 28.591995239257812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 346, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9962763786315918 - }, - { - "episode": 5568, - "epoch": 0.03336089441708308, - "loss/policy_avg": 0.030091844499111176, - "lr": 9.778246421267894e-06, - "objective/entropy": 178.1235809326172, - "objective/kl": 37.731300354003906, - "objective/non_score_reward": -1.8865652084350586, - "objective/rlhf_reward": -5.990001528468683, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 17.381601333618164, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.52734375, - "step": 347, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.001326322555542 - }, - { - "episode": 5584, - "epoch": 0.03345675905621263, - "loss/policy_avg": 0.40717682242393494, - "lr": 9.777607361963191e-06, - "objective/entropy": 90.73904418945312, - "objective/kl": 31.88462257385254, - "objective/non_score_reward": -1.594231128692627, - "objective/rlhf_reward": -5.0176747677072715, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 37.96768569946289, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5703125, - "step": 348, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991170167922974 - }, - { - "episode": 5600, - "epoch": 0.033552623695342176, - "loss/policy_avg": 0.5422201156616211, - "lr": 9.776968302658488e-06, - "objective/entropy": 80.41102600097656, - "objective/kl": 34.64447021484375, - "objective/non_score_reward": -1.7322235107421875, - "objective/rlhf_reward": -5.478295783610687, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 117.23408508300781, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.54296875, - "step": 349, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983158111572266 - }, - { - "episode": 5616, - "epoch": 0.033648488334471725, - "loss/policy_avg": 0.3756037950515747, - "lr": 9.776329243353783e-06, - "objective/entropy": 61.65838623046875, - "objective/kl": 44.269325256347656, - "objective/non_score_reward": -2.213466167449951, - "objective/rlhf_reward": -7.40326676806961, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 19.3502254486084, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.583984375, - "step": 350, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988865852355957 - }, - { - "episode": 5632, - "epoch": 0.033744352973601274, - "loss/policy_avg": 0.9775654673576355, - "lr": 9.77569018404908e-06, - "objective/entropy": 57.90337371826172, - "objective/kl": 41.80830383300781, - "objective/non_score_reward": -2.0904150009155273, - "objective/rlhf_reward": -6.628326908747354, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 84.0235824584961, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.84765625, - "step": 351, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9960914850234985 - }, - { - "episode": 5648, - "epoch": 0.03384021761273082, - "loss/policy_avg": -0.20816992223262787, - "lr": 9.775051124744377e-06, - "objective/entropy": -118.41542053222656, - "objective/kl": 23.201061248779297, - "objective/non_score_reward": -1.160053014755249, - "objective/rlhf_reward": -2.8153834894028416, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 4.062729835510254, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.4453125, - "step": 352, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0035219192504883 - }, - { - "episode": 5664, - "epoch": 0.03393608225186037, - "loss/policy_avg": 0.035901207476854324, - "lr": 9.774412065439674e-06, - "objective/entropy": 154.33920288085938, - "objective/kl": 28.773828506469727, - "objective/non_score_reward": -1.4386913776397705, - "objective/rlhf_reward": -2.8310468539011207, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 25.329944610595703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4755859375, - "step": 353, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003747940063477 - }, - { - "episode": 5680, - "epoch": 0.03403194689098992, - "loss/policy_avg": 0.7185342311859131, - "lr": 9.77377300613497e-06, - "objective/entropy": 45.80010986328125, - "objective/kl": 35.51177215576172, - "objective/non_score_reward": -1.7755887508392334, - "objective/rlhf_reward": -5.586583339961704, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 69.95939636230469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.51953125, - "step": 354, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.996915340423584 - }, - { - "episode": 5696, - "epoch": 0.03412781153011947, - "loss/policy_avg": 0.871320903301239, - "lr": 9.773133946830267e-06, - "objective/entropy": 136.34942626953125, - "objective/kl": 37.25979995727539, - "objective/non_score_reward": -1.862990140914917, - "objective/rlhf_reward": -5.504549334721501, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 58.879180908203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 355, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9990177154541016 - }, - { - "episode": 5712, - "epoch": 0.03422367616924902, - "loss/policy_avg": 0.14556461572647095, - "lr": 9.772494887525563e-06, - "objective/entropy": -10.28516960144043, - "objective/kl": 29.231609344482422, - "objective/non_score_reward": -1.461580514907837, - "objective/rlhf_reward": -4.021493013176035, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 39.2762451171875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5859375, - "step": 356, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9985218048095703 - }, - { - "episode": 5728, - "epoch": 0.03431954080837857, - "loss/policy_avg": 0.27659082412719727, - "lr": 9.77185582822086e-06, - "objective/entropy": -36.31108093261719, - "objective/kl": 32.386661529541016, - "objective/non_score_reward": -1.619333028793335, - "objective/rlhf_reward": -6.47733199596405, - "objective/scores": 0.0, - "policy/approxkl_avg": 10.265704154968262, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71484375, - "step": 357, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992687702178955 - }, - { - "episode": 5744, - "epoch": 0.03441540544750812, - "loss/policy_avg": 0.10546956956386566, - "lr": 9.771216768916156e-06, - "objective/entropy": 79.19872283935547, - "objective/kl": 22.353626251220703, - "objective/non_score_reward": -1.1176813840866089, - "objective/rlhf_reward": -3.0201275154069513, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 20.73809051513672, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4873046875, - "step": 358, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9974703788757324 - }, - { - "episode": 5760, - "epoch": 0.034511270086637666, - "loss/policy_avg": 0.5648351311683655, - "lr": 9.770577709611453e-06, - "objective/entropy": 38.47356033325195, - "objective/kl": 23.87390899658203, - "objective/non_score_reward": -1.1936955451965332, - "objective/rlhf_reward": -3.4331463485056455, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 8.14659595489502, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53125, - "step": 359, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0011978149414062 - }, - { - "episode": 5776, - "epoch": 0.034607134725767215, - "loss/policy_avg": 0.5912380814552307, - "lr": 9.76993865030675e-06, - "objective/entropy": 116.97152709960938, - "objective/kl": 40.231689453125, - "objective/non_score_reward": -2.011584758758545, - "objective/rlhf_reward": -6.565385702069163, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 117.33955383300781, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.666015625, - "step": 360, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9969582557678223 - }, - { - "episode": 5792, - "epoch": 0.034702999364896764, - "loss/policy_avg": -0.019477106630802155, - "lr": 9.769299591002045e-06, - "objective/entropy": -144.96791076660156, - "objective/kl": 27.773448944091797, - "objective/non_score_reward": -1.3886725902557373, - "objective/rlhf_reward": -5.554690062999725, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.48216438293457, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.626953125, - "step": 361, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000136137008667 - }, - { - "episode": 5808, - "epoch": 0.03479886400402631, - "loss/policy_avg": -0.5155759453773499, - "lr": 9.768660531697342e-06, - "objective/entropy": 78.00074768066406, - "objective/kl": 34.501590728759766, - "objective/non_score_reward": -1.7250795364379883, - "objective/rlhf_reward": -5.521715917674404, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 122.40145874023438, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.513671875, - "step": 362, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.02427077293396 - }, - { - "episode": 5824, - "epoch": 0.03489472864315586, - "loss/policy_avg": 0.3520805537700653, - "lr": 9.768021472392639e-06, - "objective/entropy": -66.29779815673828, - "objective/kl": 23.767650604248047, - "objective/non_score_reward": -1.188382625579834, - "objective/rlhf_reward": -3.3535303235054013, - "objective/scores": 0.35, - "policy/approxkl_avg": 66.86349487304688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.51953125, - "step": 363, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9973565340042114 - }, - { - "episode": 5840, - "epoch": 0.03499059328228541, - "loss/policy_avg": 0.25808075070381165, - "lr": 9.767382413087936e-06, - "objective/entropy": 55.69321060180664, - "objective/kl": 32.73713684082031, - "objective/non_score_reward": -1.6368569135665894, - "objective/rlhf_reward": -4.147427594661712, - "objective/scores": 0.6, - "policy/approxkl_avg": 17.00968360900879, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 364, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998908519744873 - }, - { - "episode": 5856, - "epoch": 0.03508645792141496, - "loss/policy_avg": -0.33678027987480164, - "lr": 9.766743353783233e-06, - "objective/entropy": 63.459205627441406, - "objective/kl": 36.74503707885742, - "objective/non_score_reward": -1.837251901626587, - "objective/rlhf_reward": -5.226301344410453, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 63.5507926940918, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7734375, - "step": 365, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0005531311035156 - }, - { - "episode": 5872, - "epoch": 0.03518232256054451, - "loss/policy_avg": 0.397920161485672, - "lr": 9.76610429447853e-06, - "objective/entropy": -11.37314224243164, - "objective/kl": 32.99299240112305, - "objective/non_score_reward": -1.6496496200561523, - "objective/rlhf_reward": -5.174766202171413, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 28.19782257080078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.65625, - "step": 366, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984650611877441 - }, - { - "episode": 5888, - "epoch": 0.03527818719967406, - "loss/policy_avg": 0.5101684331893921, - "lr": 9.765465235173825e-06, - "objective/entropy": 122.12913513183594, - "objective/kl": 39.20099639892578, - "objective/non_score_reward": -1.9600497484207153, - "objective/rlhf_reward": -6.480949008201046, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 10.180255889892578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.45703125, - "step": 367, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9976162910461426 - }, - { - "episode": 5904, - "epoch": 0.03537405183880361, - "loss/policy_avg": -0.46757811307907104, - "lr": 9.764826175869122e-06, - "objective/entropy": -108.47764587402344, - "objective/kl": 25.862443923950195, - "objective/non_score_reward": -1.2931220531463623, - "objective/rlhf_reward": -3.6162289073138982, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 2.3750016689300537, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.703125, - "step": 368, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0032150745391846 - }, - { - "episode": 5920, - "epoch": 0.035469916477933155, - "loss/policy_avg": 0.12928390502929688, - "lr": 9.764187116564417e-06, - "objective/entropy": 47.25078201293945, - "objective/kl": 23.20449447631836, - "objective/non_score_reward": -1.1602246761322021, - "objective/rlhf_reward": -2.240898942947388, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.1992838382720947, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.544921875, - "step": 369, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0065484046936035 - }, - { - "episode": 5936, - "epoch": 0.035565781117062704, - "loss/policy_avg": 0.15939241647720337, - "lr": 9.763548057259714e-06, - "objective/entropy": -19.609264373779297, - "objective/kl": 28.25977325439453, - "objective/non_score_reward": -1.4129884243011475, - "objective/rlhf_reward": -4.273351618138653, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 59.99807357788086, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 370, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0036120414733887 - }, - { - "episode": 5952, - "epoch": 0.03566164575619225, - "loss/policy_avg": 0.1767190843820572, - "lr": 9.76290899795501e-06, - "objective/entropy": -11.536600112915039, - "objective/kl": 36.28870391845703, - "objective/non_score_reward": -1.8144354820251465, - "objective/rlhf_reward": -7.257741451263428, - "objective/scores": 0.0, - "policy/approxkl_avg": 11.846475601196289, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.611328125, - "step": 371, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99981689453125 - }, - { - "episode": 5968, - "epoch": 0.0357575103953218, - "loss/policy_avg": 0.3314260244369507, - "lr": 9.762269938650308e-06, - "objective/entropy": -30.279476165771484, - "objective/kl": 28.756494522094727, - "objective/non_score_reward": -1.4378247261047363, - "objective/rlhf_reward": -4.300700943084106, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 39.342529296875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.623046875, - "step": 372, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998626708984375 - }, - { - "episode": 5984, - "epoch": 0.03585337503445135, - "loss/policy_avg": 0.18494009971618652, - "lr": 9.761630879345604e-06, - "objective/entropy": 68.65098571777344, - "objective/kl": 36.555747985839844, - "objective/non_score_reward": -1.8277872800827026, - "objective/rlhf_reward": -5.486320610317301, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 10.346623420715332, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.517578125, - "step": 373, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000062942504883 - }, - { - "episode": 6000, - "epoch": 0.03594923967358091, - "loss/policy_avg": -0.05254024267196655, - "lr": 9.7609918200409e-06, - "objective/entropy": -30.816913604736328, - "objective/kl": 26.80430793762207, - "objective/non_score_reward": -1.3402154445648193, - "objective/rlhf_reward": -3.845090114864048, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 3.3415722846984863, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.583984375, - "step": 374, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991049766540527 - }, - { - "episode": 6016, - "epoch": 0.036045104312710456, - "loss/policy_avg": 0.48673489689826965, - "lr": 9.760352760736196e-06, - "objective/entropy": -54.172760009765625, - "objective/kl": 26.726612091064453, - "objective/non_score_reward": -1.3363306522369385, - "objective/rlhf_reward": -0.945322489738464, - "objective/scores": 1.1, - "policy/approxkl_avg": 36.357444763183594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.708984375, - "step": 375, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999312400817871 - }, - { - "episode": 6032, - "epoch": 0.036140968951840005, - "loss/policy_avg": -0.06733483076095581, - "lr": 9.759713701431493e-06, - "objective/entropy": 135.20721435546875, - "objective/kl": 37.13209915161133, - "objective/non_score_reward": -1.856605052947998, - "objective/rlhf_reward": -4.5027009590875835, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 17.97521209716797, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4873046875, - "step": 376, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.002077341079712 - }, - { - "episode": 6048, - "epoch": 0.036236833590969554, - "loss/policy_avg": -0.041654448956251144, - "lr": 9.75907464212679e-06, - "objective/entropy": -167.12548828125, - "objective/kl": 25.773399353027344, - "objective/non_score_reward": -1.2886700630187988, - "objective/rlhf_reward": -0.7546801328659054, - "objective/scores": 1.1, - "policy/approxkl_avg": 0.800922691822052, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.521484375, - "step": 377, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000338315963745 - }, - { - "episode": 6064, - "epoch": 0.0363326982300991, - "loss/policy_avg": 0.03024141490459442, - "lr": 9.758435582822087e-06, - "objective/entropy": -73.82417297363281, - "objective/kl": 26.33017349243164, - "objective/non_score_reward": -1.3165086507797241, - "objective/rlhf_reward": -3.14332831122068, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 23.01593780517578, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.63671875, - "step": 378, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002521276473999 - }, - { - "episode": 6080, - "epoch": 0.03642856286922865, - "loss/policy_avg": 0.285569429397583, - "lr": 9.757796523517384e-06, - "objective/entropy": -111.42575073242188, - "objective/kl": 28.885374069213867, - "objective/non_score_reward": -1.4442687034606934, - "objective/rlhf_reward": -4.398472824183804, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 79.57511901855469, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.61328125, - "step": 379, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9979077577590942 - }, - { - "episode": 6096, - "epoch": 0.0365244275083582, - "loss/policy_avg": -0.022392742335796356, - "lr": 9.75715746421268e-06, - "objective/entropy": -79.86695098876953, - "objective/kl": 17.694236755371094, - "objective/non_score_reward": -0.8847118616104126, - "objective/rlhf_reward": -1.7140187576142063, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 4.339657306671143, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.767578125, - "step": 380, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0013508796691895 - }, - { - "episode": 6112, - "epoch": 0.03662029214748775, - "loss/policy_avg": 0.4459357261657715, - "lr": 9.756518404907976e-06, - "objective/entropy": -148.62872314453125, - "objective/kl": 21.098934173583984, - "objective/non_score_reward": -1.054946780204773, - "objective/rlhf_reward": 0.1802129983901981, - "objective/scores": 1.1, - "policy/approxkl_avg": 6.359186172485352, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.576171875, - "step": 381, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992458820343018 - }, - { - "episode": 6128, - "epoch": 0.0367161567866173, - "loss/policy_avg": -0.012147974222898483, - "lr": 9.755879345603273e-06, - "objective/entropy": 152.35232543945312, - "objective/kl": 31.486684799194336, - "objective/non_score_reward": -1.5743342638015747, - "objective/rlhf_reward": -3.3736180409204692, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 35.51153564453125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5859375, - "step": 382, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999483585357666 - }, - { - "episode": 6144, - "epoch": 0.03681202142574685, - "loss/policy_avg": 0.012859173119068146, - "lr": 9.75524028629857e-06, - "objective/entropy": 26.343887329101562, - "objective/kl": 33.34328079223633, - "objective/non_score_reward": -1.6671642065048218, - "objective/rlhf_reward": -4.721245358662541, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 22.614994049072266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.82421875, - "step": 383, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0018911361694336 - }, - { - "episode": 6160, - "epoch": 0.0369078860648764, - "loss/policy_avg": 0.21653258800506592, - "lr": 9.754601226993867e-06, - "objective/entropy": 109.49678039550781, - "objective/kl": 43.73469543457031, - "objective/non_score_reward": -2.186734676361084, - "objective/rlhf_reward": -7.296340326876983, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 31.000137329101562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.689453125, - "step": 384, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001706123352051 - }, - { - "episode": 6176, - "epoch": 0.037003750704005946, - "loss/policy_avg": 0.17637991905212402, - "lr": 9.753962167689162e-06, - "objective/entropy": -57.256038665771484, - "objective/kl": 20.548786163330078, - "objective/non_score_reward": -1.0274393558502197, - "objective/rlhf_reward": -1.9870514891305304, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 7.945226669311523, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.71484375, - "step": 385, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000217914581299 - }, - { - "episode": 6192, - "epoch": 0.037099615343135495, - "loss/policy_avg": 0.23474755883216858, - "lr": 9.753323108384459e-06, - "objective/entropy": -67.67970275878906, - "objective/kl": 29.886417388916016, - "objective/non_score_reward": -1.4943209886550903, - "objective/rlhf_reward": -4.461511933597263, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 30.2872314453125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.517578125, - "step": 386, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9968770742416382 - }, - { - "episode": 6208, - "epoch": 0.037195479982265044, - "loss/policy_avg": 3.0326309204101562, - "lr": 9.752684049079756e-06, - "objective/entropy": -30.304298400878906, - "objective/kl": 34.21199035644531, - "objective/non_score_reward": -1.710599660873413, - "objective/rlhf_reward": -5.391800324530944, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 191.66567993164062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62109375, - "step": 387, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998448133468628 - }, - { - "episode": 6224, - "epoch": 0.03729134462139459, - "loss/policy_avg": 0.020012550055980682, - "lr": 9.752044989775053e-06, - "objective/entropy": -44.4876594543457, - "objective/kl": 30.23657989501953, - "objective/non_score_reward": -1.5118290185928345, - "objective/rlhf_reward": -4.099904905037816, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 21.57486915588379, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.767578125, - "step": 388, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.002869129180908 - }, - { - "episode": 6240, - "epoch": 0.03738720926052414, - "loss/policy_avg": 0.33562996983528137, - "lr": 9.751405930470348e-06, - "objective/entropy": -154.47891235351562, - "objective/kl": 18.6168155670166, - "objective/non_score_reward": -0.9308407306671143, - "objective/rlhf_reward": -3.723362982273102, - "objective/scores": 0.0, - "policy/approxkl_avg": 13.14146614074707, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65625, - "step": 389, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0002055168151855 - }, - { - "episode": 6256, - "epoch": 0.03748307389965369, - "loss/policy_avg": 0.037651438266038895, - "lr": 9.750766871165645e-06, - "objective/entropy": -6.050981521606445, - "objective/kl": 26.29869270324707, - "objective/non_score_reward": -1.3149347305297852, - "objective/rlhf_reward": -5.25973904132843, - "objective/scores": 0.0, - "policy/approxkl_avg": 27.001697540283203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.759765625, - "step": 390, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982492923736572 - }, - { - "episode": 6272, - "epoch": 0.03757893853878324, - "loss/policy_avg": 0.1277342140674591, - "lr": 9.750127811860941e-06, - "objective/entropy": -114.59310913085938, - "objective/kl": 33.31782531738281, - "objective/non_score_reward": -1.6658912897109985, - "objective/rlhf_reward": -3.739846025348875, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 30.69461441040039, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.755859375, - "step": 391, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9998853206634521 - }, - { - "episode": 6288, - "epoch": 0.03767480317791279, - "loss/policy_avg": 0.08161749690771103, - "lr": 9.749488752556238e-06, - "objective/entropy": 28.02770233154297, - "objective/kl": 25.580188751220703, - "objective/non_score_reward": -1.279009461402893, - "objective/rlhf_reward": -3.6654397054627985, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 11.5637845993042, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3642578125, - "step": 392, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9969701766967773 - }, - { - "episode": 6304, - "epoch": 0.03777066781704234, - "loss/policy_avg": 0.013617899268865585, - "lr": 9.748849693251534e-06, - "objective/entropy": 137.66958618164062, - "objective/kl": 36.88829040527344, - "objective/non_score_reward": -1.8444143533706665, - "objective/rlhf_reward": -5.999055602637631, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 3.8839926719665527, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.703125, - "step": 393, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9998722076416016 - }, - { - "episode": 6320, - "epoch": 0.037866532456171886, - "loss/policy_avg": 0.7664667963981628, - "lr": 9.74821063394683e-06, - "objective/entropy": 12.1875, - "objective/kl": 27.703767776489258, - "objective/non_score_reward": -1.385188341140747, - "objective/rlhf_reward": -4.181503379081173, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 29.00311279296875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.64453125, - "step": 394, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983487129211426 - }, - { - "episode": 6336, - "epoch": 0.037962397095301435, - "loss/policy_avg": 0.13891640305519104, - "lr": 9.747571574642127e-06, - "objective/entropy": -52.291236877441406, - "objective/kl": 29.62856101989746, - "objective/non_score_reward": -1.4814281463623047, - "objective/rlhf_reward": -4.10088383701713, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 45.48643112182617, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.763671875, - "step": 395, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9967341423034668 - }, - { - "episode": 6352, - "epoch": 0.038058261734430984, - "loss/policy_avg": -0.5259265899658203, - "lr": 9.746932515337424e-06, - "objective/entropy": -14.848602294921875, - "objective/kl": 36.51825714111328, - "objective/non_score_reward": -1.8259128332138062, - "objective/rlhf_reward": -5.180944981352363, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 133.36766052246094, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.673828125, - "step": 396, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.008728504180908 - }, - { - "episode": 6368, - "epoch": 0.03815412637356053, - "loss/policy_avg": 0.1340530812740326, - "lr": 9.746293456032721e-06, - "objective/entropy": -13.48861312866211, - "objective/kl": 24.147233963012695, - "objective/non_score_reward": -1.2073616981506348, - "objective/rlhf_reward": -3.0961134592692057, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 4.865433216094971, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.787109375, - "step": 397, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0002119541168213 - }, - { - "episode": 6384, - "epoch": 0.03824999101269008, - "loss/policy_avg": 0.036313191056251526, - "lr": 9.745654396728016e-06, - "objective/entropy": -118.45596313476562, - "objective/kl": 26.90463638305664, - "objective/non_score_reward": -1.3452317714691162, - "objective/rlhf_reward": -3.5560982182350864, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 7.153594017028809, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.708984375, - "step": 398, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0009984970092773 - }, - { - "episode": 6400, - "epoch": 0.03834585565181963, - "loss/policy_avg": 0.07543957978487015, - "lr": 9.745015337423313e-06, - "objective/entropy": 5.307586669921875, - "objective/kl": 29.030933380126953, - "objective/non_score_reward": -1.4515466690063477, - "objective/rlhf_reward": -2.88246778094885, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 14.018705368041992, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.611328125, - "step": 399, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984686374664307 - }, - { - "episode": 6416, - "epoch": 0.03844172029094918, - "loss/policy_avg": 0.11864852905273438, - "lr": 9.74437627811861e-06, - "objective/entropy": 10.484695434570312, - "objective/kl": 24.462554931640625, - "objective/non_score_reward": -1.2231277227401733, - "objective/rlhf_reward": -3.376739227565464, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 14.423017501831055, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.685546875, - "step": 400, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988946914672852 - }, - { - "episode": 6432, - "epoch": 0.03853758493007873, - "loss/policy_avg": -0.036792345345020294, - "lr": 9.743737218813907e-06, - "objective/entropy": -181.87400817871094, - "objective/kl": 23.07555389404297, - "objective/non_score_reward": -1.153777837753296, - "objective/rlhf_reward": -3.191279132564632, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 20.132736206054688, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.708984375, - "step": 401, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.00081729888916 - }, - { - "episode": 6448, - "epoch": 0.03863344956920828, - "loss/policy_avg": 0.22927281260490417, - "lr": 9.743098159509204e-06, - "objective/entropy": -88.96450805664062, - "objective/kl": 32.569129943847656, - "objective/non_score_reward": -1.6284565925598145, - "objective/rlhf_reward": -4.780492917696634, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 49.499900817871094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 402, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982388019561768 - }, - { - "episode": 6464, - "epoch": 0.03872931420833783, - "loss/policy_avg": 0.30984753370285034, - "lr": 9.7424591002045e-06, - "objective/entropy": -18.365474700927734, - "objective/kl": 31.77776336669922, - "objective/non_score_reward": -1.5888882875442505, - "objective/rlhf_reward": -5.030040267735643, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 36.973690032958984, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.607421875, - "step": 403, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9970709085464478 - }, - { - "episode": 6480, - "epoch": 0.038825178847467376, - "loss/policy_avg": 0.06557717174291611, - "lr": 9.741820040899796e-06, - "objective/entropy": -141.13568115234375, - "objective/kl": 28.107177734375, - "objective/non_score_reward": -1.405358910560608, - "objective/rlhf_reward": -3.674024294094975, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 31.192813873291016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.791015625, - "step": 404, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9952688217163086 - }, - { - "episode": 6496, - "epoch": 0.038921043486596925, - "loss/policy_avg": 0.05502002686262131, - "lr": 9.741180981595093e-06, - "objective/entropy": 32.80726623535156, - "objective/kl": 44.297119140625, - "objective/non_score_reward": -2.2148561477661133, - "objective/rlhf_reward": -7.5001741287454795, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 19.57358169555664, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.650390625, - "step": 405, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999144077301025 - }, - { - "episode": 6512, - "epoch": 0.039016908125726474, - "loss/policy_avg": 0.026680059731006622, - "lr": 9.74054192229039e-06, - "objective/entropy": 119.29817962646484, - "objective/kl": 39.39287567138672, - "objective/non_score_reward": -1.9696437120437622, - "objective/rlhf_reward": -6.536938837080627, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 0.6370775699615479, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6171875, - "step": 406, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0018253326416016 - }, - { - "episode": 6528, - "epoch": 0.03911277276485602, - "loss/policy_avg": 0.6271831393241882, - "lr": 9.739902862985686e-06, - "objective/entropy": 6.752727508544922, - "objective/kl": 25.43050765991211, - "objective/non_score_reward": -1.2715253829956055, - "objective/rlhf_reward": -5.086101770401001, - "objective/scores": 0.0, - "policy/approxkl_avg": 17.81015396118164, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.74609375, - "step": 407, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977548122406006 - }, - { - "episode": 6544, - "epoch": 0.03920863740398557, - "loss/policy_avg": 0.30096232891082764, - "lr": 9.739263803680983e-06, - "objective/entropy": -24.516462326049805, - "objective/kl": 38.53913116455078, - "objective/non_score_reward": -1.9269566535949707, - "objective/rlhf_reward": -5.585120143667732, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 15.016406059265137, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.603515625, - "step": 408, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.994694709777832 - }, - { - "episode": 6560, - "epoch": 0.03930450204311512, - "loss/policy_avg": 0.03762083500623703, - "lr": 9.73862474437628e-06, - "objective/entropy": -218.5489501953125, - "objective/kl": 26.699615478515625, - "objective/non_score_reward": -1.3349807262420654, - "objective/rlhf_reward": -3.6780635170346363, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 59.4561653137207, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.54296875, - "step": 409, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982738494873047 - }, - { - "episode": 6576, - "epoch": 0.03940036668224467, - "loss/policy_avg": 0.2932765483856201, - "lr": 9.737985685071575e-06, - "objective/entropy": -25.477672576904297, - "objective/kl": 35.529788970947266, - "objective/non_score_reward": -1.776489496231079, - "objective/rlhf_reward": -5.372624413172403, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 39.98287582397461, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.55859375, - "step": 410, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999699354171753 - }, - { - "episode": 6592, - "epoch": 0.03949623132137422, - "loss/policy_avg": -0.2486688196659088, - "lr": 9.737346625766872e-06, - "objective/entropy": -12.952373504638672, - "objective/kl": 33.62919616699219, - "objective/non_score_reward": -1.681459903717041, - "objective/rlhf_reward": -4.778428207116063, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 28.626731872558594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.56640625, - "step": 411, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.003122568130493 - }, - { - "episode": 6608, - "epoch": 0.03959209596050377, - "loss/policy_avg": 0.3249208629131317, - "lr": 9.736707566462167e-06, - "objective/entropy": -52.927459716796875, - "objective/kl": 33.82263946533203, - "objective/non_score_reward": -1.6911320686340332, - "objective/rlhf_reward": -4.364528393745422, - "objective/scores": 0.6, - "policy/approxkl_avg": 41.674591064453125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.791015625, - "step": 412, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000582695007324 - }, - { - "episode": 6624, - "epoch": 0.039687960599633317, - "loss/policy_avg": 0.15019002556800842, - "lr": 9.736068507157464e-06, - "objective/entropy": -22.71458625793457, - "objective/kl": 32.99541473388672, - "objective/non_score_reward": -1.6497704982757568, - "objective/rlhf_reward": -5.257446458845763, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 6.256417274475098, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 413, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978971481323242 - }, - { - "episode": 6640, - "epoch": 0.039783825238762865, - "loss/policy_avg": 0.296099990606308, - "lr": 9.735429447852761e-06, - "objective/entropy": -10.485054016113281, - "objective/kl": 28.53786277770996, - "objective/non_score_reward": -1.4268931150436401, - "objective/rlhf_reward": -3.9742393652598063, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 4.458545684814453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.703125, - "step": 414, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996311664581299 - }, - { - "episode": 6656, - "epoch": 0.039879689877892414, - "loss/policy_avg": 0.3615366816520691, - "lr": 9.734790388548058e-06, - "objective/entropy": -102.9046859741211, - "objective/kl": 19.901390075683594, - "objective/non_score_reward": -0.9950695037841797, - "objective/rlhf_reward": -2.3184185675984486, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 18.427024841308594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.82421875, - "step": 415, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999051809310913 - }, - { - "episode": 6672, - "epoch": 0.03997555451702196, - "loss/policy_avg": 0.14772659540176392, - "lr": 9.734151329243355e-06, - "objective/entropy": -148.49395751953125, - "objective/kl": 26.190744400024414, - "objective/non_score_reward": -1.3095372915267944, - "objective/rlhf_reward": -3.6340291834512524, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 59.936073303222656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.806640625, - "step": 416, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001544237136841 - }, - { - "episode": 6688, - "epoch": 0.04007141915615151, - "loss/policy_avg": 0.23557257652282715, - "lr": 9.73351226993865e-06, - "objective/entropy": -145.32284545898438, - "objective/kl": 30.992046356201172, - "objective/non_score_reward": -1.5496025085449219, - "objective/rlhf_reward": -3.7984096765518185, - "objective/scores": 0.6, - "policy/approxkl_avg": 7.065143585205078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.904296875, - "step": 417, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989118576049805 - }, - { - "episode": 6704, - "epoch": 0.04016728379528106, - "loss/policy_avg": 0.12179827690124512, - "lr": 9.732873210633947e-06, - "objective/entropy": -64.65836334228516, - "objective/kl": 35.22796630859375, - "objective/non_score_reward": -1.7613983154296875, - "objective/rlhf_reward": -5.686343335841579, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 61.170570373535156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5546875, - "step": 418, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985511302947998 - }, - { - "episode": 6720, - "epoch": 0.04026314843441061, - "loss/policy_avg": -0.043803490698337555, - "lr": 9.732234151329244e-06, - "objective/entropy": -87.70707702636719, - "objective/kl": 28.95832061767578, - "objective/non_score_reward": -1.447916030883789, - "objective/rlhf_reward": -4.275892340930637, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.8885676860809326, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.763671875, - "step": 419, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9999752044677734 - }, - { - "episode": 6736, - "epoch": 0.04035901307354016, - "loss/policy_avg": 0.18042519688606262, - "lr": 9.73159509202454e-06, - "objective/entropy": -4.936176300048828, - "objective/kl": 30.613567352294922, - "objective/non_score_reward": -1.5306785106658936, - "objective/rlhf_reward": -4.722713804244995, - "objective/scores": 0.35, - "policy/approxkl_avg": 209.10888671875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619140625, - "step": 420, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9993661642074585 - }, - { - "episode": 6752, - "epoch": 0.04045487771266971, - "loss/policy_avg": 0.6567588448524475, - "lr": 9.730956032719838e-06, - "objective/entropy": -162.10116577148438, - "objective/kl": 33.140079498291016, - "objective/non_score_reward": -1.6570039987564087, - "objective/rlhf_reward": -4.505309881941352, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 33.703067779541016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7578125, - "step": 421, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9999632835388184 - }, - { - "episode": 6768, - "epoch": 0.04055074235179926, - "loss/policy_avg": 0.5961964130401611, - "lr": 9.730316973415135e-06, - "objective/entropy": 18.374740600585938, - "objective/kl": 36.82442092895508, - "objective/non_score_reward": -1.8412209749221802, - "objective/rlhf_reward": -4.441164646984312, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 62.1960334777832, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.654296875, - "step": 422, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999306201934814 - }, - { - "episode": 6784, - "epoch": 0.040646606990928806, - "loss/policy_avg": 0.19755011796951294, - "lr": 9.72967791411043e-06, - "objective/entropy": -57.290000915527344, - "objective/kl": 30.764808654785156, - "objective/non_score_reward": -1.5382404327392578, - "objective/rlhf_reward": -4.811326077490478, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 37.60175323486328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.67578125, - "step": 423, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990873336791992 - }, - { - "episode": 6800, - "epoch": 0.040742471630058355, - "loss/policy_avg": 0.2760317325592041, - "lr": 9.729038854805727e-06, - "objective/entropy": -54.2406005859375, - "objective/kl": 28.681961059570312, - "objective/non_score_reward": -1.4340981245040894, - "objective/rlhf_reward": -3.7889812094735458, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 13.514376640319824, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.548828125, - "step": 424, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0004029273986816 - }, - { - "episode": 6816, - "epoch": 0.040838336269187904, - "loss/policy_avg": 0.05885821580886841, - "lr": 9.728399795501023e-06, - "objective/entropy": -30.280364990234375, - "objective/kl": 31.102825164794922, - "objective/non_score_reward": -1.5551413297653198, - "objective/rlhf_reward": -4.820565319061279, - "objective/scores": 0.35, - "policy/approxkl_avg": 61.290470123291016, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.767578125, - "step": 425, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9986295700073242 - }, - { - "episode": 6832, - "epoch": 0.04093420090831745, - "loss/policy_avg": 0.044344570487737656, - "lr": 9.72776073619632e-06, - "objective/entropy": -223.16510009765625, - "objective/kl": 11.546382904052734, - "objective/non_score_reward": -0.5773191452026367, - "objective/rlhf_reward": -0.3618654114770252, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.5684561729431152, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7109375, - "step": 426, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0002474784851074 - }, - { - "episode": 6848, - "epoch": 0.041030065547447, - "loss/policy_avg": 0.11938305199146271, - "lr": 9.727121676891617e-06, - "objective/entropy": -84.6756362915039, - "objective/kl": 32.253173828125, - "objective/non_score_reward": -1.6126585006713867, - "objective/rlhf_reward": -5.000035624118194, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 54.86524963378906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.693359375, - "step": 427, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9985135793685913 - }, - { - "episode": 6864, - "epoch": 0.04112593018657655, - "loss/policy_avg": -0.02704887092113495, - "lr": 9.726482617586912e-06, - "objective/entropy": 61.31664276123047, - "objective/kl": 50.535186767578125, - "objective/non_score_reward": -2.526759624481201, - "objective/rlhf_reward": -8.765402606039672, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 87.70621490478516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4794921875, - "step": 428, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0009117126464844 - }, - { - "episode": 6880, - "epoch": 0.0412217948257061, - "loss/policy_avg": 0.3563253581523895, - "lr": 9.72584355828221e-06, - "objective/entropy": -201.59555053710938, - "objective/kl": 26.542133331298828, - "objective/non_score_reward": -1.3271067142486572, - "objective/rlhf_reward": -2.384707783104154, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 12.606565475463867, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.60546875, - "step": 429, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991586208343506 - }, - { - "episode": 6896, - "epoch": 0.04131765946483565, - "loss/policy_avg": 0.3849369287490845, - "lr": 9.725204498977506e-06, - "objective/entropy": -172.11151123046875, - "objective/kl": 31.27842140197754, - "objective/non_score_reward": -1.5639209747314453, - "objective/rlhf_reward": -4.52235098282496, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 35.41864776611328, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.794921875, - "step": 430, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9977457523345947 - }, - { - "episode": 6912, - "epoch": 0.0414135241039652, - "loss/policy_avg": 0.5410929918289185, - "lr": 9.724565439672803e-06, - "objective/entropy": -53.43696594238281, - "objective/kl": 36.75939178466797, - "objective/non_score_reward": -1.8379695415496826, - "objective/rlhf_reward": -5.229172053114448, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 11.017414093017578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.51953125, - "step": 431, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9961919784545898 - }, - { - "episode": 6928, - "epoch": 0.041509388743094754, - "loss/policy_avg": 0.5185568332672119, - "lr": 9.7239263803681e-06, - "objective/entropy": -42.49586486816406, - "objective/kl": 31.465147018432617, - "objective/non_score_reward": -1.5732574462890625, - "objective/rlhf_reward": -4.914427437869412, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.669852614402771, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62109375, - "step": 432, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9998645782470703 - }, - { - "episode": 6944, - "epoch": 0.0416052533822243, - "loss/policy_avg": -0.09886922687292099, - "lr": 9.723287321063397e-06, - "objective/entropy": -182.28286743164062, - "objective/kl": 27.1431884765625, - "objective/non_score_reward": -1.3571594953536987, - "objective/rlhf_reward": -3.6953046480814615, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 25.096237182617188, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 433, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0018882751464844 - }, - { - "episode": 6960, - "epoch": 0.04170111802135385, - "loss/policy_avg": 0.39349502325057983, - "lr": 9.722648261758692e-06, - "objective/entropy": 28.20358657836914, - "objective/kl": 38.92597198486328, - "objective/non_score_reward": -1.946298599243164, - "objective/rlhf_reward": -6.385194158554077, - "objective/scores": 0.35, - "policy/approxkl_avg": 46.153385162353516, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4912109375, - "step": 434, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992406368255615 - }, - { - "episode": 6976, - "epoch": 0.0417969826604834, - "loss/policy_avg": 0.3586619198322296, - "lr": 9.722009202453989e-06, - "objective/entropy": -126.02680206298828, - "objective/kl": 32.40974807739258, - "objective/non_score_reward": -1.6204874515533447, - "objective/rlhf_reward": -4.534538338856633, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 10.944326400756836, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.666015625, - "step": 435, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9971150159835815 - }, - { - "episode": 6992, - "epoch": 0.04189284729961295, - "loss/policy_avg": -0.4687817692756653, - "lr": 9.721370143149284e-06, - "objective/entropy": -69.42359924316406, - "objective/kl": 20.10685157775879, - "objective/non_score_reward": -1.0053426027297974, - "objective/rlhf_reward": -2.6427683430291236, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 22.483867645263672, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6328125, - "step": 436, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.041172504425049 - }, - { - "episode": 7008, - "epoch": 0.0419887119387425, - "loss/policy_avg": 0.0906272605061531, - "lr": 9.720731083844581e-06, - "objective/entropy": -149.47274780273438, - "objective/kl": 26.28115463256836, - "objective/non_score_reward": -1.3140578269958496, - "objective/rlhf_reward": -3.1335249564805365, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 2.7223973274230957, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.599609375, - "step": 437, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000173568725586 - }, - { - "episode": 7024, - "epoch": 0.04208457657787205, - "loss/policy_avg": 0.3348531126976013, - "lr": 9.720092024539878e-06, - "objective/entropy": 22.56686782836914, - "objective/kl": 36.523582458496094, - "objective/non_score_reward": -1.8261791467666626, - "objective/rlhf_reward": -5.700596723620015, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 20.443164825439453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.515625, - "step": 438, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9979515075683594 - }, - { - "episode": 7040, - "epoch": 0.042180441217001596, - "loss/policy_avg": 0.04725319519639015, - "lr": 9.719452965235175e-06, - "objective/entropy": -71.08361053466797, - "objective/kl": 20.915573120117188, - "objective/non_score_reward": -1.045778751373291, - "objective/rlhf_reward": -2.0604087731995917, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 7.088305473327637, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4462890625, - "step": 439, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0024590492248535 - }, - { - "episode": 7056, - "epoch": 0.042276305856131145, - "loss/policy_avg": 0.18381188809871674, - "lr": 9.718813905930472e-06, - "objective/entropy": 25.569873809814453, - "objective/kl": 38.07762145996094, - "objective/non_score_reward": -1.9038809537887573, - "objective/rlhf_reward": -3.215523815155029, - "objective/scores": 1.1, - "policy/approxkl_avg": 30.962854385375977, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62890625, - "step": 440, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0011448860168457 - }, - { - "episode": 7072, - "epoch": 0.042372170495260694, - "loss/policy_avg": 0.1967303454875946, - "lr": 9.718174846625767e-06, - "objective/entropy": -103.38803100585938, - "objective/kl": 29.222076416015625, - "objective/non_score_reward": -1.4611037969589233, - "objective/rlhf_reward": -4.240295205179768, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 8.899417877197266, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.689453125, - "step": 441, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9986486434936523 - }, - { - "episode": 7088, - "epoch": 0.04246803513439024, - "loss/policy_avg": -0.07635466754436493, - "lr": 9.717535787321064e-06, - "objective/entropy": -54.58887481689453, - "objective/kl": 35.043663024902344, - "objective/non_score_reward": -1.752183198928833, - "objective/rlhf_reward": -5.527779820378184, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 12.18149185180664, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.580078125, - "step": 442, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0013041496276855 - }, - { - "episode": 7104, - "epoch": 0.04256389977351979, - "loss/policy_avg": 0.3104819059371948, - "lr": 9.71689672801636e-06, - "objective/entropy": -53.842830657958984, - "objective/kl": 23.18008804321289, - "objective/non_score_reward": -1.1590044498443604, - "objective/rlhf_reward": -3.0797587921291143, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 86.82899475097656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.91796875, - "step": 443, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991846084594727 - }, - { - "episode": 7120, - "epoch": 0.04265976441264934, - "loss/policy_avg": 0.6317604780197144, - "lr": 9.716257668711657e-06, - "objective/entropy": -21.19356918334961, - "objective/kl": 30.069751739501953, - "objective/non_score_reward": -1.503487467765808, - "objective/rlhf_reward": -4.6353477025903285, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 128.40951538085938, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.76953125, - "step": 444, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.997659683227539 - }, - { - "episode": 7136, - "epoch": 0.04275562905177889, - "loss/policy_avg": 0.33194229006767273, - "lr": 9.715618609406954e-06, - "objective/entropy": -102.48907470703125, - "objective/kl": 32.374549865722656, - "objective/non_score_reward": -1.6187275648117065, - "objective/rlhf_reward": -6.474910318851471, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.681756973266602, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59375, - "step": 445, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998705506324768 - }, - { - "episode": 7152, - "epoch": 0.04285149369090844, - "loss/policy_avg": 0.26850253343582153, - "lr": 9.714979550102251e-06, - "objective/entropy": 69.35136413574219, - "objective/kl": 26.097612380981445, - "objective/non_score_reward": -1.3048806190490723, - "objective/rlhf_reward": -3.738569977696299, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 62.56462097167969, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6640625, - "step": 446, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99786376953125 - }, - { - "episode": 7168, - "epoch": 0.04294735833003799, - "loss/policy_avg": -0.1885017603635788, - "lr": 9.714340490797546e-06, - "objective/entropy": -16.98421859741211, - "objective/kl": 30.90627670288086, - "objective/non_score_reward": -1.5453139543533325, - "objective/rlhf_reward": -4.577135715548115, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 11.766645431518555, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.73828125, - "step": 447, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003262996673584 - }, - { - "episode": 7184, - "epoch": 0.04304322296916754, - "loss/policy_avg": 0.24147900938987732, - "lr": 9.713701431492843e-06, - "objective/entropy": -196.87869262695312, - "objective/kl": 23.231670379638672, - "objective/non_score_reward": -1.161583662033081, - "objective/rlhf_reward": -3.1305624780976142, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 19.03369903564453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69140625, - "step": 448, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9996755123138428 - }, - { - "episode": 7200, - "epoch": 0.043139087608297086, - "loss/policy_avg": 0.3051467537879944, - "lr": 9.71306237218814e-06, - "objective/entropy": -54.2137565612793, - "objective/kl": 33.54918670654297, - "objective/non_score_reward": -1.6774592399597168, - "objective/rlhf_reward": -5.047977810323822, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 74.37176513671875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.783203125, - "step": 449, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9967325925827026 - }, - { - "episode": 7216, - "epoch": 0.043234952247426635, - "loss/policy_avg": 0.0008301436901092529, - "lr": 9.712423312883437e-06, - "objective/entropy": -37.864322662353516, - "objective/kl": 24.052818298339844, - "objective/non_score_reward": -1.2026410102844238, - "objective/rlhf_reward": -2.9857349946823826, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 1.6498993635177612, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.525390625, - "step": 450, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001569986343384 - }, - { - "episode": 7232, - "epoch": 0.043330816886556184, - "loss/policy_avg": 0.10217726975679398, - "lr": 9.711784253578734e-06, - "objective/entropy": -97.12496948242188, - "objective/kl": 20.143707275390625, - "objective/non_score_reward": -1.007185459136963, - "objective/rlhf_reward": -2.669491672252102, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 37.34214401245117, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.783203125, - "step": 451, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9993760585784912 - }, - { - "episode": 7248, - "epoch": 0.04342668152568573, - "loss/policy_avg": 0.2181258350610733, - "lr": 9.711145194274029e-06, - "objective/entropy": -187.07266235351562, - "objective/kl": 22.520824432373047, - "objective/non_score_reward": -1.1260414123535156, - "objective/rlhf_reward": -2.9000454283395585, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 80.40426635742188, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.654296875, - "step": 452, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000328540802002 - }, - { - "episode": 7264, - "epoch": 0.04352254616481528, - "loss/policy_avg": 0.28700706362724304, - "lr": 9.710506134969326e-06, - "objective/entropy": -119.91871643066406, - "objective/kl": 30.88311004638672, - "objective/non_score_reward": -1.5441553592681885, - "objective/rlhf_reward": -4.834986022024779, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 14.897968292236328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.787109375, - "step": 453, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9972997903823853 - }, - { - "episode": 7280, - "epoch": 0.04361841080394483, - "loss/policy_avg": 0.013649387285113335, - "lr": 9.709867075664623e-06, - "objective/entropy": -137.84861755371094, - "objective/kl": 35.624549865722656, - "objective/non_score_reward": -1.781227707862854, - "objective/rlhf_reward": -7.124910950660706, - "objective/scores": 0.0, - "policy/approxkl_avg": 77.14759826660156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.712890625, - "step": 454, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999969720840454 - }, - { - "episode": 7296, - "epoch": 0.04371427544307438, - "loss/policy_avg": 0.9055305123329163, - "lr": 9.70922801635992e-06, - "objective/entropy": -177.1896514892578, - "objective/kl": 34.19129943847656, - "objective/non_score_reward": -1.7095649242401123, - "objective/rlhf_reward": -5.387661199183807, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 51.92662811279297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.66796875, - "step": 455, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9976481199264526 - }, - { - "episode": 7312, - "epoch": 0.04381014008220393, - "loss/policy_avg": -0.14486947655677795, - "lr": 9.708588957055215e-06, - "objective/entropy": -91.43609619140625, - "objective/kl": 30.12580108642578, - "objective/non_score_reward": -1.5062901973724365, - "objective/rlhf_reward": -4.509388887675938, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 24.85628890991211, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.611328125, - "step": 456, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.011446952819824 - }, - { - "episode": 7328, - "epoch": 0.04390600472133348, - "loss/policy_avg": 0.3115137815475464, - "lr": 9.707949897750512e-06, - "objective/entropy": -33.496673583984375, - "objective/kl": 24.4674072265625, - "objective/non_score_reward": -1.2233703136444092, - "objective/rlhf_reward": -3.377709650787052, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 9.057685852050781, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.751953125, - "step": 457, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0009520053863525 - }, - { - "episode": 7344, - "epoch": 0.044001869360463026, - "loss/policy_avg": 1.4892723560333252, - "lr": 9.707310838445809e-06, - "objective/entropy": -35.618934631347656, - "objective/kl": 27.64456558227539, - "objective/non_score_reward": -1.3822282552719116, - "objective/rlhf_reward": -3.5815017921494796, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 1.899414300918579, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.57421875, - "step": 458, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9999825954437256 - }, - { - "episode": 7360, - "epoch": 0.044097733999592575, - "loss/policy_avg": 0.022264737635850906, - "lr": 9.706671779141105e-06, - "objective/entropy": 31.060089111328125, - "objective/kl": 34.85979461669922, - "objective/non_score_reward": -1.7429897785186768, - "objective/rlhf_reward": -5.367839369837361, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 7.1077799797058105, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.751953125, - "step": 459, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9993565082550049 - }, - { - "episode": 7376, - "epoch": 0.044193598638722124, - "loss/policy_avg": 0.08219340443611145, - "lr": 9.7060327198364e-06, - "objective/entropy": -69.6414566040039, - "objective/kl": 35.42669677734375, - "objective/non_score_reward": -1.7713346481323242, - "objective/rlhf_reward": -5.726088785861416, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 21.27887535095215, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.712890625, - "step": 460, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0088043212890625 - }, - { - "episode": 7392, - "epoch": 0.04428946327785167, - "loss/policy_avg": 0.03685396909713745, - "lr": 9.705393660531698e-06, - "objective/entropy": -245.04380798339844, - "objective/kl": 21.42380142211914, - "objective/non_score_reward": -1.0711899995803833, - "objective/rlhf_reward": -2.1620538852372504, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.849046230316162, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.767578125, - "step": 461, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.008730411529541 - }, - { - "episode": 7408, - "epoch": 0.04438532791698122, - "loss/policy_avg": 0.5492111444473267, - "lr": 9.704754601226994e-06, - "objective/entropy": 9.25466537475586, - "objective/kl": 20.997167587280273, - "objective/non_score_reward": -1.0498583316802979, - "objective/rlhf_reward": -1.2757146700632302, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 36.03380584716797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9296875, - "step": 462, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000826120376587 - }, - { - "episode": 7424, - "epoch": 0.04448119255611077, - "loss/policy_avg": 0.22961178421974182, - "lr": 9.704115541922291e-06, - "objective/entropy": -2.9236984252929688, - "objective/kl": 26.89717674255371, - "objective/non_score_reward": -1.3448588848114014, - "objective/rlhf_reward": -3.717576061905013, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 133.2696075439453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8984375, - "step": 463, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999076008796692 - }, - { - "episode": 7440, - "epoch": 0.04457705719524032, - "loss/policy_avg": 0.1330358386039734, - "lr": 9.703476482617588e-06, - "objective/entropy": -155.3049774169922, - "objective/kl": 32.32700729370117, - "objective/non_score_reward": -1.6163502931594849, - "objective/rlhf_reward": -2.0654012918472286, - "objective/scores": 1.1, - "policy/approxkl_avg": 352.436767578125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.86328125, - "step": 464, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9973843097686768 - }, - { - "episode": 7456, - "epoch": 0.04467292183436987, - "loss/policy_avg": 0.13191767036914825, - "lr": 9.702837423312883e-06, - "objective/entropy": -130.06350708007812, - "objective/kl": 31.98480987548828, - "objective/non_score_reward": -1.5992405414581299, - "objective/rlhf_reward": -5.07144889596097, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.149503707885742, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.654296875, - "step": 465, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979965686798096 - }, - { - "episode": 7472, - "epoch": 0.04476878647349942, - "loss/policy_avg": 0.11230316013097763, - "lr": 9.70219836400818e-06, - "objective/entropy": 11.579151153564453, - "objective/kl": 34.1675910949707, - "objective/non_score_reward": -1.7083796262741089, - "objective/rlhf_reward": -5.3525657681778664, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 13.865779876708984, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.708984375, - "step": 466, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.00162410736084 - }, - { - "episode": 7488, - "epoch": 0.04486465111262897, - "loss/policy_avg": 0.2810555398464203, - "lr": 9.701559304703477e-06, - "objective/entropy": -138.13914489746094, - "objective/kl": 22.91815948486328, - "objective/non_score_reward": -1.145907998085022, - "objective/rlhf_reward": -3.205029585448605, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 97.98136901855469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.775390625, - "step": 467, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984302520751953 - }, - { - "episode": 7504, - "epoch": 0.044960515751758516, - "loss/policy_avg": -0.09679757058620453, - "lr": 9.700920245398774e-06, - "objective/entropy": -44.23152160644531, - "objective/kl": 34.52162170410156, - "objective/non_score_reward": -1.726081132888794, - "objective/rlhf_reward": -5.170991019407907, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 12.573694229125977, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.3896484375, - "step": 468, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995930194854736 - }, - { - "episode": 7520, - "epoch": 0.045056380390888065, - "loss/policy_avg": 0.2740531265735626, - "lr": 9.700281186094071e-06, - "objective/entropy": -64.87997436523438, - "objective/kl": 30.31191062927246, - "objective/non_score_reward": -1.5155954360961914, - "objective/rlhf_reward": -4.329048738876978, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 12.677139282226562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.630859375, - "step": 469, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981954097747803 - }, - { - "episode": 7536, - "epoch": 0.045152245030017614, - "loss/policy_avg": 0.4849107265472412, - "lr": 9.699642126789368e-06, - "objective/entropy": -136.48355102539062, - "objective/kl": 20.618619918823242, - "objective/non_score_reward": -1.030930995941162, - "objective/rlhf_reward": -2.6998918845253863, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 95.56924438476562, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.607421875, - "step": 470, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9975948333740234 - }, - { - "episode": 7552, - "epoch": 0.04524810966914716, - "loss/policy_avg": 0.05032477527856827, - "lr": 9.699003067484663e-06, - "objective/entropy": -116.99330139160156, - "objective/kl": 31.927814483642578, - "objective/non_score_reward": -1.596390724182129, - "objective/rlhf_reward": -5.026312672828121, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.1943883895874023, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.744140625, - "step": 471, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0004005432128906 - }, - { - "episode": 7568, - "epoch": 0.04534397430827671, - "loss/policy_avg": 0.23768550157546997, - "lr": 9.69836400817996e-06, - "objective/entropy": -56.441200256347656, - "objective/kl": 35.956565856933594, - "objective/non_score_reward": -1.7978280782699585, - "objective/rlhf_reward": -5.587192330423909, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 18.25104522705078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.75390625, - "step": 472, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001668930053711 - }, - { - "episode": 7584, - "epoch": 0.04543983894740626, - "loss/policy_avg": 0.18428431451320648, - "lr": 9.697724948875257e-06, - "objective/entropy": -12.911811828613281, - "objective/kl": 31.440038681030273, - "objective/non_score_reward": -1.5720020532608032, - "objective/rlhf_reward": -4.554674939314523, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 33.68145751953125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.673828125, - "step": 473, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997875690460205 - }, - { - "episode": 7600, - "epoch": 0.04553570358653581, - "loss/policy_avg": 1.0267724990844727, - "lr": 9.697085889570554e-06, - "objective/entropy": -155.81759643554688, - "objective/kl": 15.551814079284668, - "objective/non_score_reward": -0.7775906920433044, - "objective/rlhf_reward": -1.7317606593049586, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 3.7084851264953613, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.673828125, - "step": 474, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998857975006104 - }, - { - "episode": 7616, - "epoch": 0.04563156822566536, - "loss/policy_avg": 0.5301028490066528, - "lr": 9.69644683026585e-06, - "objective/entropy": -186.65789794921875, - "objective/kl": 37.16144561767578, - "objective/non_score_reward": -1.858072280883789, - "objective/rlhf_reward": -5.876029699054316, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 48.150047302246094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 475, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9972370862960815 - }, - { - "episode": 7632, - "epoch": 0.04572743286479491, - "loss/policy_avg": 0.2144310474395752, - "lr": 9.695807770961146e-06, - "objective/entropy": -153.16233825683594, - "objective/kl": 31.742645263671875, - "objective/non_score_reward": -1.5871323347091675, - "objective/rlhf_reward": -4.832757556232151, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 43.260581970214844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62109375, - "step": 476, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996619701385498 - }, - { - "episode": 7648, - "epoch": 0.04582329750392446, - "loss/policy_avg": 0.1423683762550354, - "lr": 9.695168711656443e-06, - "objective/entropy": -101.34695434570312, - "objective/kl": 34.40277099609375, - "objective/non_score_reward": -1.7201385498046875, - "objective/rlhf_reward": -5.555040988951845, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 6.133903503417969, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.740234375, - "step": 477, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991512298583984 - }, - { - "episode": 7664, - "epoch": 0.045919162143054006, - "loss/policy_avg": -0.20567180216312408, - "lr": 9.694529652351738e-06, - "objective/entropy": 1.8477153778076172, - "objective/kl": 34.25542068481445, - "objective/non_score_reward": -1.7127711772918701, - "objective/rlhf_reward": -5.451084411144256, - "objective/scores": 0.35, - "policy/approxkl_avg": 90.96925354003906, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.79296875, - "step": 478, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978113174438477 - }, - { - "episode": 7680, - "epoch": 0.046015026782183555, - "loss/policy_avg": 0.04285082221031189, - "lr": 9.693890593047035e-06, - "objective/entropy": -163.51800537109375, - "objective/kl": 39.76237487792969, - "objective/non_score_reward": -1.9881186485290527, - "objective/rlhf_reward": -6.47152245324409, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 41.795677185058594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.64453125, - "step": 479, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9989352226257324 - }, - { - "episode": 7696, - "epoch": 0.046110891421313104, - "loss/policy_avg": 0.30679094791412354, - "lr": 9.693251533742331e-06, - "objective/entropy": -137.21139526367188, - "objective/kl": 24.817203521728516, - "objective/non_score_reward": -1.2408602237701416, - "objective/rlhf_reward": -3.407181172576502, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 7.010622024536133, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.740234375, - "step": 480, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998010516166687 - }, - { - "episode": 7712, - "epoch": 0.04620675606044265, - "loss/policy_avg": 0.14935311675071716, - "lr": 9.692612474437628e-06, - "objective/entropy": -133.61581420898438, - "objective/kl": 28.18117904663086, - "objective/non_score_reward": -1.4090590476989746, - "objective/rlhf_reward": -4.276986324523373, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 41.72409439086914, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.720703125, - "step": 481, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9976277351379395 - }, - { - "episode": 7728, - "epoch": 0.0463026206995722, - "loss/policy_avg": 0.4503282606601715, - "lr": 9.691973415132925e-06, - "objective/entropy": -185.92971801757812, - "objective/kl": 24.44643783569336, - "objective/non_score_reward": -1.22232186794281, - "objective/rlhf_reward": -4.88928747177124, - "objective/scores": 0.0, - "policy/approxkl_avg": 26.91709327697754, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.55078125, - "step": 482, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986282587051392 - }, - { - "episode": 7744, - "epoch": 0.04639848533870175, - "loss/policy_avg": 0.7586182355880737, - "lr": 9.691334355828222e-06, - "objective/entropy": -136.83555603027344, - "objective/kl": 27.66883087158203, - "objective/non_score_reward": -1.38344144821167, - "objective/rlhf_reward": -3.41105959035543, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 39.446250915527344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4287109375, - "step": 483, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9947845935821533 - }, - { - "episode": 7760, - "epoch": 0.0464943499778313, - "loss/policy_avg": 0.47291696071624756, - "lr": 9.690695296523517e-06, - "objective/entropy": 10.135929107666016, - "objective/kl": 31.171567916870117, - "objective/non_score_reward": -1.558578372001648, - "objective/rlhf_reward": -4.572453921259033, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 15.718633651733398, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.611328125, - "step": 484, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997343897819519 - }, - { - "episode": 7776, - "epoch": 0.04659021461696085, - "loss/policy_avg": 0.19839856028556824, - "lr": 9.690056237218814e-06, - "objective/entropy": -64.7506332397461, - "objective/kl": 25.45448112487793, - "objective/non_score_reward": -1.2727241516113281, - "objective/rlhf_reward": -2.690896427631378, - "objective/scores": 0.6, - "policy/approxkl_avg": 29.054779052734375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.76953125, - "step": 485, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977914094924927 - }, - { - "episode": 7792, - "epoch": 0.0466860792560904, - "loss/policy_avg": 0.16692940890789032, - "lr": 9.689417177914111e-06, - "objective/entropy": -200.1573028564453, - "objective/kl": 16.24359893798828, - "objective/non_score_reward": -0.8121800422668457, - "objective/rlhf_reward": -1.6446000672021683, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 3.7478506565093994, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.58984375, - "step": 486, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997503757476807 - }, - { - "episode": 7808, - "epoch": 0.046781943895219946, - "loss/policy_avg": 0.20832450687885284, - "lr": 9.688778118609408e-06, - "objective/entropy": -229.8734893798828, - "objective/kl": 24.610809326171875, - "objective/non_score_reward": -1.2305405139923096, - "objective/rlhf_reward": -3.3180417156854443, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 50.22547912597656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 487, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9977538585662842 - }, - { - "episode": 7824, - "epoch": 0.046877808534349495, - "loss/policy_avg": 0.584824800491333, - "lr": 9.688139059304705e-06, - "objective/entropy": -159.94088745117188, - "objective/kl": 32.78782653808594, - "objective/non_score_reward": -1.6393911838531494, - "objective/rlhf_reward": -5.041793072017368, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 53.52165985107422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4560546875, - "step": 488, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9974932670593262 - }, - { - "episode": 7840, - "epoch": 0.046973673173479044, - "loss/policy_avg": 0.10657641291618347, - "lr": 9.6875e-06, - "objective/entropy": -117.46031188964844, - "objective/kl": 22.680068969726562, - "objective/non_score_reward": -1.1340034008026123, - "objective/rlhf_reward": -2.802680269877116, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 31.437467575073242, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.640625, - "step": 489, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984140396118164 - }, - { - "episode": 7856, - "epoch": 0.0470695378126086, - "loss/policy_avg": 0.05225694179534912, - "lr": 9.686860940695297e-06, - "objective/entropy": -102.69722747802734, - "objective/kl": 35.890769958496094, - "objective/non_score_reward": -1.7945387363433838, - "objective/rlhf_reward": -2.7781547069549557, - "objective/scores": 1.1, - "policy/approxkl_avg": 8.238727569580078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.693359375, - "step": 490, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.996579885482788 - }, - { - "episode": 7872, - "epoch": 0.04716540245173815, - "loss/policy_avg": 0.3118276000022888, - "lr": 9.686221881390594e-06, - "objective/entropy": -42.73939895629883, - "objective/kl": 22.486095428466797, - "objective/non_score_reward": -1.1243047714233398, - "objective/rlhf_reward": -3.0733869268494525, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 29.32803726196289, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.609375, - "step": 491, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991399049758911 - }, - { - "episode": 7888, - "epoch": 0.0472612670908677, - "loss/policy_avg": 0.621738076210022, - "lr": 9.68558282208589e-06, - "objective/entropy": -26.77874755859375, - "objective/kl": 33.77405548095703, - "objective/non_score_reward": -1.688702940940857, - "objective/rlhf_reward": -5.198552160468653, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 9.273128509521484, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73046875, - "step": 492, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988332986831665 - }, - { - "episode": 7904, - "epoch": 0.04735713172999725, - "loss/policy_avg": 0.16049662232398987, - "lr": 9.684943762781188e-06, - "objective/entropy": -84.04755401611328, - "objective/kl": 25.384605407714844, - "objective/non_score_reward": -1.2692303657531738, - "objective/rlhf_reward": -2.1532023891222205, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 0.7223958373069763, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.646484375, - "step": 493, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0005576610565186 - }, - { - "episode": 7920, - "epoch": 0.047452996369126796, - "loss/policy_avg": 0.3413264751434326, - "lr": 9.684304703476484e-06, - "objective/entropy": -118.85188293457031, - "objective/kl": 30.77880859375, - "objective/non_score_reward": -1.5389404296875, - "objective/rlhf_reward": -4.422428623835246, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 19.30898666381836, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.611328125, - "step": 494, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997934103012085 - }, - { - "episode": 7936, - "epoch": 0.047548861008256345, - "loss/policy_avg": -0.016445789486169815, - "lr": 9.68366564417178e-06, - "objective/entropy": -211.39361572265625, - "objective/kl": 26.587682723999023, - "objective/non_score_reward": -1.3293840885162354, - "objective/rlhf_reward": -2.917536354064941, - "objective/scores": 0.6, - "policy/approxkl_avg": 50.449562072753906, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5234375, - "step": 495, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.99893319606781 - }, - { - "episode": 7952, - "epoch": 0.047644725647385894, - "loss/policy_avg": -0.2565712034702301, - "lr": 9.683026584867076e-06, - "objective/entropy": -49.41560363769531, - "objective/kl": 27.722068786621094, - "objective/non_score_reward": -1.3861035108566284, - "objective/rlhf_reward": -3.882554417074309, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 16.277629852294922, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.703125, - "step": 496, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.003061294555664 - }, - { - "episode": 7968, - "epoch": 0.04774059028651544, - "loss/policy_avg": 0.17001637816429138, - "lr": 9.682387525562373e-06, - "objective/entropy": -40.254676818847656, - "objective/kl": 25.527742385864258, - "objective/non_score_reward": -1.2763869762420654, - "objective/rlhf_reward": -5.10554826259613, - "objective/scores": 0.0, - "policy/approxkl_avg": 19.284744262695312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6875, - "step": 497, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9972081184387207 - }, - { - "episode": 7984, - "epoch": 0.04783645492564499, - "loss/policy_avg": 0.08028728514909744, - "lr": 9.68174846625767e-06, - "objective/entropy": -23.79485321044922, - "objective/kl": 23.14282989501953, - "objective/non_score_reward": -1.1571415662765503, - "objective/rlhf_reward": -4.628566324710846, - "objective/scores": 0.0, - "policy/approxkl_avg": 25.781452178955078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4580078125, - "step": 498, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9980249404907227 - }, - { - "episode": 8000, - "epoch": 0.04793231956477454, - "loss/policy_avg": 0.2174569070339203, - "lr": 9.681109406952967e-06, - "objective/entropy": -109.13389587402344, - "objective/kl": 36.64985656738281, - "objective/non_score_reward": -1.8324928283691406, - "objective/rlhf_reward": -5.951369323817593, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 27.508981704711914, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.525390625, - "step": 499, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99745512008667 - }, - { - "episode": 8016, - "epoch": 0.04802818420390409, - "loss/policy_avg": 0.13631635904312134, - "lr": 9.680470347648262e-06, - "objective/entropy": -99.519775390625, - "objective/kl": 41.364810943603516, - "objective/non_score_reward": -2.0682406425476074, - "objective/rlhf_reward": -6.448134417804788, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 102.98858642578125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4970703125, - "step": 500, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998145341873169 - }, - { - "episode": 8032, - "epoch": 0.04812404884303364, - "loss/policy_avg": 0.059351589530706406, - "lr": 9.67983128834356e-06, - "objective/entropy": -226.86756896972656, - "objective/kl": 27.588150024414062, - "objective/non_score_reward": -1.379407525062561, - "objective/rlhf_reward": -4.001858436855015, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 6.536296844482422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65625, - "step": 501, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9976544380187988 - }, - { - "episode": 8048, - "epoch": 0.04821991348216319, - "loss/policy_avg": 0.5408469438552856, - "lr": 9.679192229038854e-06, - "objective/entropy": 4.518913269042969, - "objective/kl": 37.552825927734375, - "objective/non_score_reward": -1.8776414394378662, - "objective/rlhf_reward": -5.777232364813486, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 8.410907745361328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.787109375, - "step": 502, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991774559020996 - }, - { - "episode": 8064, - "epoch": 0.048315778121292736, - "loss/policy_avg": 1.089150071144104, - "lr": 9.678553169734151e-06, - "objective/entropy": -70.22102355957031, - "objective/kl": 36.886138916015625, - "objective/non_score_reward": -1.8443071842193604, - "objective/rlhf_reward": -5.254522027746711, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 17.696430206298828, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.662109375, - "step": 503, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9944283962249756 - }, - { - "episode": 8080, - "epoch": 0.048411642760422285, - "loss/policy_avg": 0.04815336689352989, - "lr": 9.677914110429448e-06, - "objective/entropy": -206.61251831054688, - "objective/kl": 19.784542083740234, - "objective/non_score_reward": -0.9892270565032959, - "objective/rlhf_reward": -2.4411365626179538, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 10.987642288208008, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 504, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982428550720215 - }, - { - "episode": 8096, - "epoch": 0.048507507399551834, - "loss/policy_avg": 0.4511667788028717, - "lr": 9.677275051124745e-06, - "objective/entropy": -44.11040496826172, - "objective/kl": 32.054603576660156, - "objective/non_score_reward": -1.6027300357818604, - "objective/rlhf_reward": -4.8951483605229225, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 161.647705078125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 505, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990203380584717 - }, - { - "episode": 8112, - "epoch": 0.04860337203868138, - "loss/policy_avg": 0.43728113174438477, - "lr": 9.676635991820042e-06, - "objective/entropy": -167.46401977539062, - "objective/kl": 25.358474731445312, - "objective/non_score_reward": -1.2679238319396973, - "objective/rlhf_reward": -5.071695148944855, - "objective/scores": 0.0, - "policy/approxkl_avg": 6.505180358886719, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.67578125, - "step": 506, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999653697013855 - }, - { - "episode": 8128, - "epoch": 0.04869923667781093, - "loss/policy_avg": 0.049704909324645996, - "lr": 9.675996932515339e-06, - "objective/entropy": -68.84889221191406, - "objective/kl": 23.506563186645508, - "objective/non_score_reward": -1.1753281354904175, - "objective/rlhf_reward": -3.3227105523027003, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 3.8750853538513184, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.783203125, - "step": 507, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.99936842918396 - }, - { - "episode": 8144, - "epoch": 0.04879510131694048, - "loss/policy_avg": 0.23126532137393951, - "lr": 9.675357873210634e-06, - "objective/entropy": -193.32493591308594, - "objective/kl": 30.975135803222656, - "objective/non_score_reward": -1.5487568378448486, - "objective/rlhf_reward": -4.072320940271888, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 30.721832275390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.599609375, - "step": 508, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.994474172592163 - }, - { - "episode": 8160, - "epoch": 0.04889096595607003, - "loss/policy_avg": 0.6136177778244019, - "lr": 9.67471881390593e-06, - "objective/entropy": 35.12611770629883, - "objective/kl": 24.636138916015625, - "objective/non_score_reward": -1.2318068742752075, - "objective/rlhf_reward": -2.979816268162663, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 31.945526123046875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.736328125, - "step": 509, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001587390899658 - }, - { - "episode": 8176, - "epoch": 0.04898683059519958, - "loss/policy_avg": 0.07654842734336853, - "lr": 9.674079754601228e-06, - "objective/entropy": -218.7822265625, - "objective/kl": 30.072967529296875, - "objective/non_score_reward": -1.5036484003067017, - "objective/rlhf_reward": -3.8918873689332347, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 42.21351623535156, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.763671875, - "step": 510, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9961347579956055 - }, - { - "episode": 8192, - "epoch": 0.04908269523432913, - "loss/policy_avg": 0.4642539322376251, - "lr": 9.673440695296525e-06, - "objective/entropy": -61.26002502441406, - "objective/kl": 28.09502410888672, - "objective/non_score_reward": -1.4047513008117676, - "objective/rlhf_reward": -4.168407420726165, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 28.139495849609375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.50390625, - "step": 511, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988558292388916 - }, - { - "episode": 8208, - "epoch": 0.04917855987345868, - "loss/policy_avg": -0.1496490240097046, - "lr": 9.672801635991821e-06, - "objective/entropy": -237.9604034423828, - "objective/kl": 24.80710220336914, - "objective/non_score_reward": -1.2403552532196045, - "objective/rlhf_reward": -3.5828184867776454, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 9.494747161865234, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.671875, - "step": 512, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000725507736206 - }, - { - "episode": 8224, - "epoch": 0.049274424512588226, - "loss/policy_avg": -0.18209466338157654, - "lr": 9.672162576687117e-06, - "objective/entropy": -180.66116333007812, - "objective/kl": 25.97962188720703, - "objective/non_score_reward": -1.2989810705184937, - "objective/rlhf_reward": -3.073217930571113, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 41.079193115234375, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.73046875, - "step": 513, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.997527837753296 - }, - { - "episode": 8240, - "epoch": 0.049370289151717775, - "loss/policy_avg": 0.3504701852798462, - "lr": 9.671523517382413e-06, - "objective/entropy": -98.80787658691406, - "objective/kl": 26.576587677001953, - "objective/non_score_reward": -1.3288295269012451, - "objective/rlhf_reward": -0.9153180480003353, - "objective/scores": 1.1, - "policy/approxkl_avg": 13.758487701416016, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6953125, - "step": 514, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998854160308838 - }, - { - "episode": 8256, - "epoch": 0.049466153790847324, - "loss/policy_avg": 0.48611417412757874, - "lr": 9.67088445807771e-06, - "objective/entropy": -128.45774841308594, - "objective/kl": 29.784334182739258, - "objective/non_score_reward": -1.4892168045043945, - "objective/rlhf_reward": -4.223533527056375, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.2566263675689697, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 515, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001587867736816 - }, - { - "episode": 8272, - "epoch": 0.04956201842997687, - "loss/policy_avg": -0.13057222962379456, - "lr": 9.670245398773007e-06, - "objective/entropy": -146.07781982421875, - "objective/kl": 31.182106018066406, - "objective/non_score_reward": -1.5591052770614624, - "objective/rlhf_reward": -3.8364211082458493, - "objective/scores": 0.6, - "policy/approxkl_avg": 15.76829719543457, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.576171875, - "step": 516, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0000805854797363 - }, - { - "episode": 8288, - "epoch": 0.04965788306910642, - "loss/policy_avg": 0.637583315372467, - "lr": 9.669606339468304e-06, - "objective/entropy": -144.37762451171875, - "objective/kl": 27.648868560791016, - "objective/non_score_reward": -1.3824436664581299, - "objective/rlhf_reward": -4.0140026448094215, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.933715343475342, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.64453125, - "step": 517, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9995014667510986 - }, - { - "episode": 8304, - "epoch": 0.04975374770823597, - "loss/policy_avg": 0.23517751693725586, - "lr": 9.668967280163601e-06, - "objective/entropy": -130.0078125, - "objective/kl": 26.889904022216797, - "objective/non_score_reward": -1.344495415687561, - "objective/rlhf_reward": -3.927383343787536, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 35.43697738647461, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.81640625, - "step": 518, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9984192848205566 - }, - { - "episode": 8320, - "epoch": 0.04984961234736552, - "loss/policy_avg": -0.05650443956255913, - "lr": 9.668328220858896e-06, - "objective/entropy": -214.1605682373047, - "objective/kl": 21.148624420166016, - "objective/non_score_reward": -1.0574312210083008, - "objective/rlhf_reward": -2.673465876784876, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 18.935588836669922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.599609375, - "step": 519, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993261098861694 - }, - { - "episode": 8336, - "epoch": 0.04994547698649507, - "loss/policy_avg": -0.034447960555553436, - "lr": 9.667689161554193e-06, - "objective/entropy": -158.14088439941406, - "objective/kl": 32.29146957397461, - "objective/non_score_reward": -1.61457359790802, - "objective/rlhf_reward": -4.902035086360529, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 6.876145362854004, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58984375, - "step": 520, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9993882179260254 - }, - { - "episode": 8352, - "epoch": 0.05004134162562462, - "loss/policy_avg": -0.13744737207889557, - "lr": 9.66705010224949e-06, - "objective/entropy": -204.13546752929688, - "objective/kl": 28.699504852294922, - "objective/non_score_reward": -1.4349753856658936, - "objective/rlhf_reward": -4.361299076167446, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 2.3828086853027344, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.66796875, - "step": 521, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0017244815826416 - }, - { - "episode": 8368, - "epoch": 0.05013720626475417, - "loss/policy_avg": 0.13512714207172394, - "lr": 9.666411042944787e-06, - "objective/entropy": -234.03375244140625, - "objective/kl": 27.24090576171875, - "objective/non_score_reward": -1.3620452880859375, - "objective/rlhf_reward": -3.932409131320652, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 27.1795654296875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.689453125, - "step": 522, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999112844467163 - }, - { - "episode": 8384, - "epoch": 0.050233070903883716, - "loss/policy_avg": -0.011349002830684185, - "lr": 9.665771983640082e-06, - "objective/entropy": -252.35935974121094, - "objective/kl": 35.68749237060547, - "objective/non_score_reward": -1.784374713897705, - "objective/rlhf_reward": -5.7588963890946925, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 13.969385147094727, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.626953125, - "step": 523, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9982863664627075 - }, - { - "episode": 8400, - "epoch": 0.050328935543013265, - "loss/policy_avg": 0.03610409051179886, - "lr": 9.665132924335379e-06, - "objective/entropy": -18.527732849121094, - "objective/kl": 31.889944076538086, - "objective/non_score_reward": -1.5944972038269043, - "objective/rlhf_reward": -4.927390317530975, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 69.35887145996094, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.86328125, - "step": 524, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999839425086975 - }, - { - "episode": 8416, - "epoch": 0.050424800182142814, - "loss/policy_avg": 0.4427942633628845, - "lr": 9.664493865030676e-06, - "objective/entropy": -203.7809295654297, - "objective/kl": 25.36702537536621, - "objective/non_score_reward": -1.2683511972427368, - "objective/rlhf_reward": -3.6495729281502642, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 22.38974380493164, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.828125, - "step": 525, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989018440246582 - }, - { - "episode": 8432, - "epoch": 0.05052066482127236, - "loss/policy_avg": 1.6773953437805176, - "lr": 9.663854805725971e-06, - "objective/entropy": -146.93841552734375, - "objective/kl": 37.069419860839844, - "objective/non_score_reward": -1.853471040725708, - "objective/rlhf_reward": -5.990052063663569, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 11.231493949890137, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.53515625, - "step": 526, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981741905212402 - }, - { - "episode": 8448, - "epoch": 0.05061652946040191, - "loss/policy_avg": -0.08897572010755539, - "lr": 9.663215746421268e-06, - "objective/entropy": -158.65708923339844, - "objective/kl": 23.60004997253418, - "objective/non_score_reward": -1.1800025701522827, - "objective/rlhf_reward": -3.394497547179384, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 10.824882507324219, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6953125, - "step": 527, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9995331764221191 - }, - { - "episode": 8464, - "epoch": 0.05071239409953146, - "loss/policy_avg": 0.024341005831956863, - "lr": 9.662576687116565e-06, - "objective/entropy": -174.72035217285156, - "objective/kl": 29.104461669921875, - "objective/non_score_reward": -1.4552230834960938, - "objective/rlhf_reward": -4.479256918936401, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 17.054231643676758, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.470703125, - "step": 528, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999741554260254 - }, - { - "episode": 8480, - "epoch": 0.05080825873866101, - "loss/policy_avg": 0.257159948348999, - "lr": 9.661937627811862e-06, - "objective/entropy": -200.30184936523438, - "objective/kl": 23.69171905517578, - "objective/non_score_reward": -1.1845859289169312, - "objective/rlhf_reward": -3.338343775272369, - "objective/scores": 0.35, - "policy/approxkl_avg": 6.550008773803711, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.595703125, - "step": 529, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9984736442565918 - }, - { - "episode": 8496, - "epoch": 0.05090412337779056, - "loss/policy_avg": 0.4184650182723999, - "lr": 9.661298568507158e-06, - "objective/entropy": -344.7420959472656, - "objective/kl": 24.219188690185547, - "objective/non_score_reward": -1.2109594345092773, - "objective/rlhf_reward": -3.4652354205525935, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 67.58980560302734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 530, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9985125064849854 - }, - { - "episode": 8512, - "epoch": 0.05099998801692011, - "loss/policy_avg": -0.0187949538230896, - "lr": 9.660659509202455e-06, - "objective/entropy": -14.01883316040039, - "objective/kl": 29.49643325805664, - "objective/non_score_reward": -1.47482168674469, - "objective/rlhf_reward": -4.520684697715145, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 6.090343475341797, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.59765625, - "step": 531, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0014190673828125 - }, - { - "episode": 8528, - "epoch": 0.051095852656049656, - "loss/policy_avg": 0.5480527877807617, - "lr": 9.66002044989775e-06, - "objective/entropy": -169.82949829101562, - "objective/kl": 34.57899475097656, - "objective/non_score_reward": -1.728949785232544, - "objective/rlhf_reward": -5.434846642430186, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 7.255028247833252, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.638671875, - "step": 532, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9971615076065063 - }, - { - "episode": 8544, - "epoch": 0.051191717295179205, - "loss/policy_avg": 0.2761814594268799, - "lr": 9.659381390593047e-06, - "objective/entropy": -100.77452850341797, - "objective/kl": 36.835365295410156, - "objective/non_score_reward": -1.8417682647705078, - "objective/rlhf_reward": -6.007823192809505, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 50.438026428222656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 533, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9985384941101074 - }, - { - "episode": 8560, - "epoch": 0.051287581934308754, - "loss/policy_avg": 0.4119563698768616, - "lr": 9.658742331288344e-06, - "objective/entropy": -65.70556640625, - "objective/kl": 29.577213287353516, - "objective/non_score_reward": -1.47886061668396, - "objective/rlhf_reward": -3.792736174837623, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 7.75493049621582, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4990234375, - "step": 534, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002697467803955 - }, - { - "episode": 8576, - "epoch": 0.0513834465734383, - "loss/policy_avg": 0.12609338760375977, - "lr": 9.658103271983641e-06, - "objective/entropy": -150.71954345703125, - "objective/kl": 28.952709197998047, - "objective/non_score_reward": -1.447635531425476, - "objective/rlhf_reward": -4.3905422449111935, - "objective/scores": 0.35, - "policy/approxkl_avg": 34.924835205078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.615234375, - "step": 535, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0004661083221436 - }, - { - "episode": 8592, - "epoch": 0.05147931121256785, - "loss/policy_avg": 0.014640828594565392, - "lr": 9.657464212678938e-06, - "objective/entropy": -37.74507141113281, - "objective/kl": 25.910266876220703, - "objective/non_score_reward": -1.295513391494751, - "objective/rlhf_reward": -0.7820532083511349, - "objective/scores": 1.1, - "policy/approxkl_avg": 2.0191965103149414, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.751953125, - "step": 536, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0010244846343994 - }, - { - "episode": 8608, - "epoch": 0.0515751758516974, - "loss/policy_avg": 0.04429921880364418, - "lr": 9.656825153374235e-06, - "objective/entropy": -26.176483154296875, - "objective/kl": 32.8004264831543, - "objective/non_score_reward": -1.6400213241577148, - "objective/rlhf_reward": -4.826751814285913, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 42.128135681152344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.908203125, - "step": 537, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0013113021850586 - }, - { - "episode": 8624, - "epoch": 0.05167104049082695, - "loss/policy_avg": 0.46547916531562805, - "lr": 9.65618609406953e-06, - "objective/entropy": 7.776313781738281, - "objective/kl": 28.19791030883789, - "objective/non_score_reward": -1.4098955392837524, - "objective/rlhf_reward": -3.906248764197031, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 4.504173755645752, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8828125, - "step": 538, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998396635055542 - }, - { - "episode": 8640, - "epoch": 0.0517669051299565, - "loss/policy_avg": 0.0001214742660522461, - "lr": 9.655547034764827e-06, - "objective/entropy": -112.6850357055664, - "objective/kl": 31.756372451782227, - "objective/non_score_reward": -1.5878187417984009, - "objective/rlhf_reward": -4.228568734900032, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 2.7504100799560547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.546875, - "step": 539, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0001778602600098 - }, - { - "episode": 8656, - "epoch": 0.05186276976908605, - "loss/policy_avg": 0.41524794697761536, - "lr": 9.654907975460124e-06, - "objective/entropy": -135.01878356933594, - "objective/kl": 23.119266510009766, - "objective/non_score_reward": -1.1559633016586304, - "objective/rlhf_reward": -3.0675939609676153, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 26.581480026245117, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6015625, - "step": 540, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9962246417999268 - }, - { - "episode": 8672, - "epoch": 0.0519586344082156, - "loss/policy_avg": 0.3321428894996643, - "lr": 9.65426891615542e-06, - "objective/entropy": -5.44740104675293, - "objective/kl": 39.89240264892578, - "objective/non_score_reward": -1.9946203231811523, - "objective/rlhf_reward": -7.97848105430603, - "objective/scores": 0.0, - "policy/approxkl_avg": 67.52932739257812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 541, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9986319541931152 - }, - { - "episode": 8688, - "epoch": 0.052054499047345146, - "loss/policy_avg": 0.22704890370368958, - "lr": 9.653629856850718e-06, - "objective/entropy": 23.631000518798828, - "objective/kl": 22.43924331665039, - "objective/non_score_reward": -1.121962308883667, - "objective/rlhf_reward": -3.109246918050152, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 40.600868225097656, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.708984375, - "step": 542, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0008883476257324 - }, - { - "episode": 8704, - "epoch": 0.052150363686474695, - "loss/policy_avg": 0.6167892217636108, - "lr": 9.652990797546013e-06, - "objective/entropy": 8.02947998046875, - "objective/kl": 34.78337478637695, - "objective/non_score_reward": -1.739168643951416, - "objective/rlhf_reward": -5.556674695014953, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.763035774230957, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.55859375, - "step": 543, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983458518981934 - }, - { - "episode": 8720, - "epoch": 0.052246228325604244, - "loss/policy_avg": 0.1720658838748932, - "lr": 9.65235173824131e-06, - "objective/entropy": 0.5252876281738281, - "objective/kl": 31.73941993713379, - "objective/non_score_reward": -1.5869710445404053, - "objective/rlhf_reward": -4.79162499209936, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 6.366281509399414, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.76953125, - "step": 544, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988775253295898 - }, - { - "episode": 8736, - "epoch": 0.05234209296473379, - "loss/policy_avg": 0.07084909081459045, - "lr": 9.651712678936605e-06, - "objective/entropy": -50.734527587890625, - "objective/kl": 24.657032012939453, - "objective/non_score_reward": -1.2328516244888306, - "objective/rlhf_reward": -3.1065776899185886, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 12.337860107421875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.521484375, - "step": 545, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9985425472259521 - }, - { - "episode": 8752, - "epoch": 0.05243795760386334, - "loss/policy_avg": -0.053861357271671295, - "lr": 9.651073619631902e-06, - "objective/entropy": -242.29559326171875, - "objective/kl": 21.178913116455078, - "objective/non_score_reward": -1.058945655822754, - "objective/rlhf_reward": -2.6316629386583146, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 23.818538665771484, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.62109375, - "step": 546, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0112152099609375 - }, - { - "episode": 8768, - "epoch": 0.0525338222429929, - "loss/policy_avg": -0.008508548140525818, - "lr": 9.650434560327199e-06, - "objective/entropy": -46.92424011230469, - "objective/kl": 39.04132843017578, - "objective/non_score_reward": -1.952066421508789, - "objective/rlhf_reward": -6.429663398352963, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 15.27535629272461, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4677734375, - "step": 547, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982926845550537 - }, - { - "episode": 8784, - "epoch": 0.052629686882122446, - "loss/policy_avg": 0.17654258012771606, - "lr": 9.649795501022496e-06, - "objective/entropy": -44.7242431640625, - "objective/kl": 19.804813385009766, - "objective/non_score_reward": -0.9902406930923462, - "objective/rlhf_reward": -2.635450038939638, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 39.75682067871094, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.76953125, - "step": 548, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002223014831543 - }, - { - "episode": 8800, - "epoch": 0.052725551521251995, - "loss/policy_avg": 0.46367156505584717, - "lr": 9.649156441717792e-06, - "objective/entropy": -132.18556213378906, - "objective/kl": 38.18450927734375, - "objective/non_score_reward": -1.909225344657898, - "objective/rlhf_reward": -6.0327816343942455, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 24.263263702392578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7421875, - "step": 549, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9974383115768433 - }, - { - "episode": 8816, - "epoch": 0.052821416160381544, - "loss/policy_avg": 0.2747136950492859, - "lr": 9.64851738241309e-06, - "objective/entropy": -91.26388549804688, - "objective/kl": 28.735111236572266, - "objective/non_score_reward": -1.4367555379867554, - "objective/rlhf_reward": -4.085162764013396, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 5.113122940063477, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.626953125, - "step": 550, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000535011291504 - }, - { - "episode": 8832, - "epoch": 0.05291728079951109, - "loss/policy_avg": 0.031243963167071342, - "lr": 9.647878323108384e-06, - "objective/entropy": -40.358192443847656, - "objective/kl": 31.673667907714844, - "objective/non_score_reward": -1.5836834907531738, - "objective/rlhf_reward": -4.993098309546142, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 78.17581939697266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.796875, - "step": 551, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989157915115356 - }, - { - "episode": 8848, - "epoch": 0.05301314543864064, - "loss/policy_avg": -0.28017422556877136, - "lr": 9.647239263803681e-06, - "objective/entropy": -100.97856140136719, - "objective/kl": 33.18678283691406, - "objective/non_score_reward": -1.659339189529419, - "objective/rlhf_reward": -6.637356638908386, - "objective/scores": 0.0, - "policy/approxkl_avg": 6.006505012512207, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.572265625, - "step": 552, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.003824234008789 - }, - { - "episode": 8864, - "epoch": 0.05310901007777019, - "loss/policy_avg": 0.04892890527844429, - "lr": 9.646600204498978e-06, - "objective/entropy": -136.31918334960938, - "objective/kl": 19.06879997253418, - "objective/non_score_reward": -0.9534400105476379, - "objective/rlhf_reward": -2.2575007369190008, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 1.5354987382888794, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.572265625, - "step": 553, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.003046989440918 - }, - { - "episode": 8880, - "epoch": 0.05320487471689974, - "loss/policy_avg": 0.1114959716796875, - "lr": 9.645961145194275e-06, - "objective/entropy": -125.14915466308594, - "objective/kl": 41.65575408935547, - "objective/non_score_reward": -2.0827877521514893, - "objective/rlhf_reward": -6.383740137295659, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 12.4759521484375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.66796875, - "step": 554, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973247051239014 - }, - { - "episode": 8896, - "epoch": 0.05330073935602929, - "loss/policy_avg": 0.2784144878387451, - "lr": 9.645322085889572e-06, - "objective/entropy": -42.213340759277344, - "objective/kl": 34.43170928955078, - "objective/non_score_reward": -1.7215855121612549, - "objective/rlhf_reward": -6.8863421976566315, - "objective/scores": 0.0, - "policy/approxkl_avg": 37.5791015625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.546875, - "step": 555, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9974336624145508 - }, - { - "episode": 8912, - "epoch": 0.05339660399515884, - "loss/policy_avg": -0.0683375895023346, - "lr": 9.644683026584867e-06, - "objective/entropy": -94.292724609375, - "objective/kl": 29.925048828125, - "objective/non_score_reward": -1.4962522983551025, - "objective/rlhf_reward": -4.4287500669627935, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 0.9679741263389587, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.53125, - "step": 556, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.002323627471924 - }, - { - "episode": 8928, - "epoch": 0.05349246863428839, - "loss/policy_avg": 0.3528517484664917, - "lr": 9.644043967280164e-06, - "objective/entropy": 100.1601791381836, - "objective/kl": 29.87194061279297, - "objective/non_score_reward": -1.4935970306396484, - "objective/rlhf_reward": -4.493435802872538, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 21.40321922302246, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.54296875, - "step": 557, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999911904335022 - }, - { - "episode": 8944, - "epoch": 0.053588333273417936, - "loss/policy_avg": 0.15664523839950562, - "lr": 9.643404907975461e-06, - "objective/entropy": -163.13458251953125, - "objective/kl": 43.485382080078125, - "objective/non_score_reward": -2.174269199371338, - "objective/rlhf_reward": -6.297076797485351, - "objective/scores": 0.6, - "policy/approxkl_avg": 28.333932876586914, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.52734375, - "step": 558, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9964426755905151 - }, - { - "episode": 8960, - "epoch": 0.053684197912547485, - "loss/policy_avg": 0.6344835162162781, - "lr": 9.642765848670758e-06, - "objective/entropy": -252.752685546875, - "objective/kl": 33.16960144042969, - "objective/non_score_reward": -1.658479928970337, - "objective/rlhf_reward": -5.255317785827023, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 52.37012481689453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.771484375, - "step": 559, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.0004310607910156 - }, - { - "episode": 8976, - "epoch": 0.053780062551677034, - "loss/policy_avg": 0.19869406521320343, - "lr": 9.642126789366055e-06, - "objective/entropy": -50.086647033691406, - "objective/kl": 30.926883697509766, - "objective/non_score_reward": -1.5463443994522095, - "objective/rlhf_reward": -4.629118292537287, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 26.995628356933594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.626953125, - "step": 560, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9950168132781982 - }, - { - "episode": 8992, - "epoch": 0.05387592719080658, - "loss/policy_avg": -0.010918349027633667, - "lr": 9.641487730061352e-06, - "objective/entropy": -168.9771728515625, - "objective/kl": 22.5106201171875, - "objective/non_score_reward": -1.1255309581756592, - "objective/rlhf_reward": -3.1604882984453733, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 20.162094116210938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.763671875, - "step": 561, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.001406669616699 - }, - { - "episode": 9008, - "epoch": 0.05397179182993613, - "loss/policy_avg": 0.4963573217391968, - "lr": 9.640848670756647e-06, - "objective/entropy": -159.58302307128906, - "objective/kl": 34.39787673950195, - "objective/non_score_reward": -1.7198940515518188, - "objective/rlhf_reward": -5.455743868549433, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 32.154441833496094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59375, - "step": 562, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99989652633667 - }, - { - "episode": 9024, - "epoch": 0.05406765646906568, - "loss/policy_avg": 0.4512660503387451, - "lr": 9.640209611451944e-06, - "objective/entropy": -112.33628845214844, - "objective/kl": 34.371681213378906, - "objective/non_score_reward": -1.7185840606689453, - "objective/rlhf_reward": -5.515086495612545, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 4.578237056732178, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.599609375, - "step": 563, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9984909296035767 - }, - { - "episode": 9040, - "epoch": 0.05416352110819523, - "loss/policy_avg": 0.08781366050243378, - "lr": 9.63957055214724e-06, - "objective/entropy": -39.49800491333008, - "objective/kl": 33.1617431640625, - "objective/non_score_reward": -1.6580872535705566, - "objective/rlhf_reward": -4.232348775863647, - "objective/scores": 0.6, - "policy/approxkl_avg": 4.19449520111084, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.677734375, - "step": 564, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000304698944092 - }, - { - "episode": 9056, - "epoch": 0.05425938574732478, - "loss/policy_avg": 0.02701903134584427, - "lr": 9.638931492842537e-06, - "objective/entropy": -135.10118103027344, - "objective/kl": 34.19304656982422, - "objective/non_score_reward": -1.7096521854400635, - "objective/rlhf_reward": -5.388010840030059, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 18.33478546142578, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.61328125, - "step": 565, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999070644378662 - }, - { - "episode": 9072, - "epoch": 0.05435525038645433, - "loss/policy_avg": 0.2804332375526428, - "lr": 9.638292433537834e-06, - "objective/entropy": -100.01052856445312, - "objective/kl": 28.388795852661133, - "objective/non_score_reward": -1.4194397926330566, - "objective/rlhf_reward": -5.677759170532227, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.587360382080078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.68359375, - "step": 566, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0005486011505127 - }, - { - "episode": 9088, - "epoch": 0.05445111502558388, - "loss/policy_avg": 0.4314262866973877, - "lr": 9.63765337423313e-06, - "objective/entropy": -130.2495574951172, - "objective/kl": 35.38700866699219, - "objective/non_score_reward": -1.7693501710891724, - "objective/rlhf_reward": -5.4155414156323545, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 44.93388366699219, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.75390625, - "step": 567, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9968568086624146 - }, - { - "episode": 9104, - "epoch": 0.054546979664713426, - "loss/policy_avg": 0.3399587869644165, - "lr": 9.637014314928426e-06, - "objective/entropy": -247.61073303222656, - "objective/kl": 28.445119857788086, - "objective/non_score_reward": -1.4222559928894043, - "objective/rlhf_reward": -3.864195342334818, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 7.162724018096924, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.607421875, - "step": 568, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984673261642456 - }, - { - "episode": 9120, - "epoch": 0.054642844303842975, - "loss/policy_avg": 0.5520263314247131, - "lr": 9.636375255623721e-06, - "objective/entropy": -97.92376708984375, - "objective/kl": 26.055057525634766, - "objective/non_score_reward": -1.30275297164917, - "objective/rlhf_reward": -3.088305356279884, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 36.18694305419922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.494140625, - "step": 569, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0012941360473633 - }, - { - "episode": 9136, - "epoch": 0.054738708942972523, - "loss/policy_avg": 0.09734541922807693, - "lr": 9.635736196319018e-06, - "objective/entropy": -196.53872680664062, - "objective/kl": 23.71702003479004, - "objective/non_score_reward": -1.185850977897644, - "objective/rlhf_reward": -4.743403911590576, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.213500738143921, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.580078125, - "step": 570, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993475675582886 - }, - { - "episode": 9152, - "epoch": 0.05483457358210207, - "loss/policy_avg": 0.4516823887825012, - "lr": 9.635097137014315e-06, - "objective/entropy": -126.11761474609375, - "objective/kl": 28.336185455322266, - "objective/non_score_reward": -1.4168094396591187, - "objective/rlhf_reward": -1.2672375202178952, - "objective/scores": 1.1, - "policy/approxkl_avg": 44.684326171875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.775390625, - "step": 571, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990687370300293 - }, - { - "episode": 9168, - "epoch": 0.05493043822123162, - "loss/policy_avg": 0.34894299507141113, - "lr": 9.634458077709612e-06, - "objective/entropy": -3.410472869873047, - "objective/kl": 35.99509048461914, - "objective/non_score_reward": -1.7997545003890991, - "objective/rlhf_reward": -5.87350514891736, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.621858596801758, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.634765625, - "step": 572, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0019896030426025 - }, - { - "episode": 9184, - "epoch": 0.05502630286036117, - "loss/policy_avg": 0.1023169457912445, - "lr": 9.633819018404909e-06, - "objective/entropy": -180.73724365234375, - "objective/kl": 24.693328857421875, - "objective/non_score_reward": -1.2346664667129517, - "objective/rlhf_reward": -3.5386658668518063, - "objective/scores": 0.35, - "policy/approxkl_avg": 22.89309310913086, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.64453125, - "step": 573, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981255531311035 - }, - { - "episode": 9200, - "epoch": 0.05512216749949072, - "loss/policy_avg": 0.2509443163871765, - "lr": 9.633179959100206e-06, - "objective/entropy": -268.43072509765625, - "objective/kl": 28.437435150146484, - "objective/non_score_reward": -1.4218716621398926, - "objective/rlhf_reward": -4.131227611508921, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 60.228729248046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.78125, - "step": 574, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000219345092773 - }, - { - "episode": 9216, - "epoch": 0.05521803213862027, - "loss/policy_avg": -0.04683633893728256, - "lr": 9.632540899795501e-06, - "objective/entropy": -70.71329498291016, - "objective/kl": 38.51101303100586, - "objective/non_score_reward": -1.9255508184432983, - "objective/rlhf_reward": -5.877374465736459, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 3.3532156944274902, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.615234375, - "step": 575, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000458002090454 - }, - { - "episode": 9232, - "epoch": 0.05531389677774982, - "loss/policy_avg": 0.25571292638778687, - "lr": 9.631901840490798e-06, - "objective/entropy": -197.88787841796875, - "objective/kl": 25.574037551879883, - "objective/non_score_reward": -1.278701901435852, - "objective/rlhf_reward": -3.3814741532007853, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 6.096738815307617, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.671875, - "step": 576, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001577854156494 - }, - { - "episode": 9248, - "epoch": 0.055409761416879366, - "loss/policy_avg": 0.7064580917358398, - "lr": 9.631262781186095e-06, - "objective/entropy": -150.29953002929688, - "objective/kl": 30.821884155273438, - "objective/non_score_reward": -1.5410943031311035, - "objective/rlhf_reward": -4.43104387919108, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 43.45115280151367, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.60546875, - "step": 577, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9980316162109375 - }, - { - "episode": 9264, - "epoch": 0.055505626056008915, - "loss/policy_avg": 0.20062510669231415, - "lr": 9.630623721881392e-06, - "objective/entropy": -158.88388061523438, - "objective/kl": 28.73421859741211, - "objective/non_score_reward": -1.4367109537124634, - "objective/rlhf_reward": -4.346843814849853, - "objective/scores": 0.35, - "policy/approxkl_avg": 12.110857963562012, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69140625, - "step": 578, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998002290725708 - }, - { - "episode": 9280, - "epoch": 0.055601490695138464, - "loss/policy_avg": 0.08450721949338913, - "lr": 9.629984662576689e-06, - "objective/entropy": -250.45445251464844, - "objective/kl": 27.57752227783203, - "objective/non_score_reward": -1.3788762092590332, - "objective/rlhf_reward": -4.064906816096649, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 17.175188064575195, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5859375, - "step": 579, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.997262716293335 - }, - { - "episode": 9296, - "epoch": 0.05569735533426801, - "loss/policy_avg": 0.41482874751091003, - "lr": 9.629345603271984e-06, - "objective/entropy": -177.06607055664062, - "objective/kl": 29.43456268310547, - "objective/non_score_reward": -1.4717282056808472, - "objective/rlhf_reward": -2.9631939872514934, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 50.86977005004883, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.595703125, - "step": 580, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0005576610565186 - }, - { - "episode": 9312, - "epoch": 0.05579321997339756, - "loss/policy_avg": 0.20043331384658813, - "lr": 9.62870654396728e-06, - "objective/entropy": -224.79660034179688, - "objective/kl": 23.171340942382812, - "objective/non_score_reward": -1.1585670709609985, - "objective/rlhf_reward": -2.6868569953011825, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.841948986053467, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.603515625, - "step": 581, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.00144362449646 - }, - { - "episode": 9328, - "epoch": 0.05588908461252711, - "loss/policy_avg": 0.28447139263153076, - "lr": 9.628067484662578e-06, - "objective/entropy": -44.1309814453125, - "objective/kl": 42.387351989746094, - "objective/non_score_reward": -2.1193675994873047, - "objective/rlhf_reward": -7.151957724124117, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 20.72610092163086, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.419921875, - "step": 582, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9971909523010254 - }, - { - "episode": 9344, - "epoch": 0.05598494925165666, - "loss/policy_avg": 0.09533769637346268, - "lr": 9.627428425357874e-06, - "objective/entropy": -218.9058380126953, - "objective/kl": 27.360652923583984, - "objective/non_score_reward": -1.368032693862915, - "objective/rlhf_reward": -4.021532396884307, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 11.28432846069336, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 583, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9974052906036377 - }, - { - "episode": 9360, - "epoch": 0.05608081389078621, - "loss/policy_avg": 0.5065032243728638, - "lr": 9.626789366053171e-06, - "objective/entropy": -231.38427734375, - "objective/kl": 32.08224105834961, - "objective/non_score_reward": -1.604112148284912, - "objective/rlhf_reward": -5.0748127012545154, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 40.948760986328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.69921875, - "step": 584, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9988150596618652 - }, - { - "episode": 9376, - "epoch": 0.05617667852991576, - "loss/policy_avg": 0.6530688405036926, - "lr": 9.626150306748468e-06, - "objective/entropy": -116.65798950195312, - "objective/kl": 31.407730102539062, - "objective/non_score_reward": -1.570386528968811, - "objective/rlhf_reward": -4.902944007006985, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 13.348186492919922, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.54296875, - "step": 585, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000188112258911 - }, - { - "episode": 9392, - "epoch": 0.05627254316904531, - "loss/policy_avg": -0.06093317270278931, - "lr": 9.625511247443763e-06, - "objective/entropy": -245.7208251953125, - "objective/kl": 22.28873634338379, - "objective/non_score_reward": -1.1144368648529053, - "objective/rlhf_reward": -2.33504098869947, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.7080774307250977, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.681640625, - "step": 586, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.002075672149658 - }, - { - "episode": 9408, - "epoch": 0.056368407808174856, - "loss/policy_avg": 0.4493389129638672, - "lr": 9.62487218813906e-06, - "objective/entropy": -11.156410217285156, - "objective/kl": 29.71312141418457, - "objective/non_score_reward": -1.4856561422348022, - "objective/rlhf_reward": -4.117795641693186, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 18.012893676757812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.759765625, - "step": 587, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000352382659912 - }, - { - "episode": 9424, - "epoch": 0.056464272447304405, - "loss/policy_avg": 0.3274408280849457, - "lr": 9.624233128834357e-06, - "objective/entropy": -116.3506088256836, - "objective/kl": 35.94437026977539, - "objective/non_score_reward": -1.7972185611724854, - "objective/rlhf_reward": -4.788874185085296, - "objective/scores": 0.6, - "policy/approxkl_avg": 17.158645629882812, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.69140625, - "step": 588, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9996552467346191 - }, - { - "episode": 9440, - "epoch": 0.056560137086433954, - "loss/policy_avg": 0.879096508026123, - "lr": 9.623594069529654e-06, - "objective/entropy": -152.50155639648438, - "objective/kl": 32.464576721191406, - "objective/non_score_reward": -1.623228669166565, - "objective/rlhf_reward": -5.069082756240931, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 70.49058532714844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.849609375, - "step": 589, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001586437225342 - }, - { - "episode": 9456, - "epoch": 0.0566560017255635, - "loss/policy_avg": 0.2921786904335022, - "lr": 9.62295501022495e-06, - "objective/entropy": -177.27088928222656, - "objective/kl": 39.783531188964844, - "objective/non_score_reward": -1.989176630973816, - "objective/rlhf_reward": -6.57810423621307, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 77.26689147949219, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6796875, - "step": 590, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9989970922470093 - }, - { - "episode": 9472, - "epoch": 0.05675186636469305, - "loss/policy_avg": 0.3912142515182495, - "lr": 9.622315950920246e-06, - "objective/entropy": -120.1540756225586, - "objective/kl": 31.21270179748535, - "objective/non_score_reward": -1.5606350898742676, - "objective/rlhf_reward": -3.842540299892425, - "objective/scores": 0.6, - "policy/approxkl_avg": 25.256790161132812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.654296875, - "step": 591, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9980988502502441 - }, - { - "episode": 9488, - "epoch": 0.0568477310038226, - "loss/policy_avg": 0.04369340091943741, - "lr": 9.621676891615543e-06, - "objective/entropy": -277.40753173828125, - "objective/kl": 29.685585021972656, - "objective/non_score_reward": -1.4842792749404907, - "objective/rlhf_reward": -1.5371170997619625, - "objective/scores": 1.1, - "policy/approxkl_avg": 7.890674591064453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.677734375, - "step": 592, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981930255889893 - }, - { - "episode": 9504, - "epoch": 0.05694359564295215, - "loss/policy_avg": 0.05721379816532135, - "lr": 9.621037832310838e-06, - "objective/entropy": -257.69232177734375, - "objective/kl": 23.966060638427734, - "objective/non_score_reward": -1.19830322265625, - "objective/rlhf_reward": -3.0598793412248293, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 20.133102416992188, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.732421875, - "step": 593, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.001232624053955 - }, - { - "episode": 9520, - "epoch": 0.0570394602820817, - "loss/policy_avg": 0.5772296786308289, - "lr": 9.620398773006135e-06, - "objective/entropy": -89.6330795288086, - "objective/kl": 31.078372955322266, - "objective/non_score_reward": -1.5539186000823975, - "objective/rlhf_reward": -4.734722021038889, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 21.1763916015625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.87109375, - "step": 594, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000513792037964 - }, - { - "episode": 9536, - "epoch": 0.05713532492121125, - "loss/policy_avg": -0.026315592229366302, - "lr": 9.619759713701432e-06, - "objective/entropy": -219.30979919433594, - "objective/kl": 26.461135864257812, - "objective/non_score_reward": -1.323056697845459, - "objective/rlhf_reward": -3.9329772827371787, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 8.585318565368652, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.677734375, - "step": 595, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0008788108825684 - }, - { - "episode": 9552, - "epoch": 0.057231189560340796, - "loss/policy_avg": 0.2548080384731293, - "lr": 9.619120654396729e-06, - "objective/entropy": -37.27716827392578, - "objective/kl": 44.03446960449219, - "objective/non_score_reward": -2.201723575592041, - "objective/rlhf_reward": -7.356295923800811, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 21.06201934814453, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.63671875, - "step": 596, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991514682769775 - }, - { - "episode": 9568, - "epoch": 0.057327054199470345, - "loss/policy_avg": 2.5911049842834473, - "lr": 9.618481595092026e-06, - "objective/entropy": -171.7782745361328, - "objective/kl": 20.800029754638672, - "objective/non_score_reward": -1.0400015115737915, - "objective/rlhf_reward": -1.760006046295166, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.9469943046569824, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 597, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.014529228210449 - }, - { - "episode": 9584, - "epoch": 0.057422918838599894, - "loss/policy_avg": -0.1166892945766449, - "lr": 9.617842535787323e-06, - "objective/entropy": -109.67333221435547, - "objective/kl": 34.37934494018555, - "objective/non_score_reward": -1.7189671993255615, - "objective/rlhf_reward": -6.8758686780929565, - "objective/scores": 0.0, - "policy/approxkl_avg": 17.377391815185547, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4794921875, - "step": 598, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0002119541168213 - }, - { - "episode": 9600, - "epoch": 0.05751878347772944, - "loss/policy_avg": -0.15396325290203094, - "lr": 9.617203476482618e-06, - "objective/entropy": -128.05728149414062, - "objective/kl": 29.42688751220703, - "objective/non_score_reward": -1.4713443517684937, - "objective/rlhf_reward": -4.060548658641886, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 2.408236026763916, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.71484375, - "step": 599, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002739429473877 - }, - { - "episode": 9616, - "epoch": 0.05761464811685899, - "loss/policy_avg": 0.14407247304916382, - "lr": 9.616564417177915e-06, - "objective/entropy": -272.3529357910156, - "objective/kl": 21.596874237060547, - "objective/non_score_reward": -1.0798437595367432, - "objective/rlhf_reward": -1.3956560238611426, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 7.104412078857422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58984375, - "step": 600, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001608371734619 - }, - { - "episode": 9632, - "epoch": 0.05771051275598854, - "loss/policy_avg": 0.20445303618907928, - "lr": 9.615925357873211e-06, - "objective/entropy": -291.0384521484375, - "objective/kl": 28.06856918334961, - "objective/non_score_reward": -1.403428554534912, - "objective/rlhf_reward": -4.235111692038876, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 9.333198547363281, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.720703125, - "step": 601, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0009381771087646 - }, - { - "episode": 9648, - "epoch": 0.05780637739511809, - "loss/policy_avg": 0.7656448483467102, - "lr": 9.615286298568508e-06, - "objective/entropy": -4.355806350708008, - "objective/kl": 34.863006591796875, - "objective/non_score_reward": -1.7431503534317017, - "objective/rlhf_reward": -5.548769433696833, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 10.645190238952637, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.591796875, - "step": 602, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9971041679382324 - }, - { - "episode": 9664, - "epoch": 0.05790224203424764, - "loss/policy_avg": 0.1100698709487915, - "lr": 9.614647239263805e-06, - "objective/entropy": -203.49618530273438, - "objective/kl": 19.046649932861328, - "objective/non_score_reward": -0.9523325562477112, - "objective/rlhf_reward": -2.4307281161225855, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 0.499467670917511, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.638671875, - "step": 603, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0019733905792236 - }, - { - "episode": 9680, - "epoch": 0.05799810667337719, - "loss/policy_avg": 0.17878472805023193, - "lr": 9.6140081799591e-06, - "objective/entropy": -162.996826171875, - "objective/kl": 23.458127975463867, - "objective/non_score_reward": -1.172906517982483, - "objective/rlhf_reward": -3.3661131596862504, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 8.434497833251953, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5390625, - "step": 604, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9986371994018555 - }, - { - "episode": 9696, - "epoch": 0.058093971312506744, - "loss/policy_avg": 0.5608217716217041, - "lr": 9.613369120654397e-06, - "objective/entropy": -168.91802978515625, - "objective/kl": 31.90495491027832, - "objective/non_score_reward": -1.5952478647232056, - "objective/rlhf_reward": -3.4572724446069927, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 10.658321380615234, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.57421875, - "step": 605, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999322772026062 - }, - { - "episode": 9712, - "epoch": 0.05818983595163629, - "loss/policy_avg": 0.10194225609302521, - "lr": 9.612730061349694e-06, - "objective/entropy": -138.00286865234375, - "objective/kl": 34.8355712890625, - "objective/non_score_reward": -1.7417783737182617, - "objective/rlhf_reward": -5.641600999861879, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 19.823665618896484, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.765625, - "step": 606, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000347852706909 - }, - { - "episode": 9728, - "epoch": 0.05828570059076584, - "loss/policy_avg": 1.170401930809021, - "lr": 9.612091002044991e-06, - "objective/entropy": -171.179443359375, - "objective/kl": 23.883764266967773, - "objective/non_score_reward": -1.1941882371902466, - "objective/rlhf_reward": -3.2609813449704017, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 1.674392819404602, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.654296875, - "step": 607, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0004193782806396 - }, - { - "episode": 9744, - "epoch": 0.05838156522989539, - "loss/policy_avg": 0.05054464191198349, - "lr": 9.611451942740288e-06, - "objective/entropy": -196.56436157226562, - "objective/kl": 23.218883514404297, - "objective/non_score_reward": -1.1609442234039307, - "objective/rlhf_reward": -3.1931789918855276, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 11.145727157592773, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.60546875, - "step": 608, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998692512512207 - }, - { - "episode": 9760, - "epoch": 0.05847742986902494, - "loss/policy_avg": 0.054385945200920105, - "lr": 9.610812883435585e-06, - "objective/entropy": -244.93141174316406, - "objective/kl": 29.985477447509766, - "objective/non_score_reward": -1.4992740154266357, - "objective/rlhf_reward": -4.637845957015438, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 19.703460693359375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69140625, - "step": 609, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0000925064086914 - }, - { - "episode": 9776, - "epoch": 0.05857329450815449, - "loss/policy_avg": -0.05685323104262352, - "lr": 9.61017382413088e-06, - "objective/entropy": -65.63417053222656, - "objective/kl": 31.53623390197754, - "objective/non_score_reward": -1.5768117904663086, - "objective/rlhf_reward": -3.383528147579405, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 12.860790252685547, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.59375, - "step": 610, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001614809036255 - }, - { - "episode": 9792, - "epoch": 0.05866915914728404, - "loss/policy_avg": 0.20876801013946533, - "lr": 9.609534764826177e-06, - "objective/entropy": -112.53227996826172, - "objective/kl": 41.12568664550781, - "objective/non_score_reward": -2.0562844276428223, - "objective/rlhf_reward": -5.825137710571289, - "objective/scores": 0.6, - "policy/approxkl_avg": 33.385337829589844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.626953125, - "step": 611, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000166654586792 - }, - { - "episode": 9808, - "epoch": 0.05876502378641359, - "loss/policy_avg": 0.2722185552120209, - "lr": 9.608895705521472e-06, - "objective/entropy": -124.71205139160156, - "objective/kl": 38.9796257019043, - "objective/non_score_reward": -1.9489812850952148, - "objective/rlhf_reward": -5.395925498008728, - "objective/scores": 0.6, - "policy/approxkl_avg": 19.52260971069336, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.623046875, - "step": 612, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9988607168197632 - }, - { - "episode": 9824, - "epoch": 0.058860888425543136, - "loss/policy_avg": 0.7936792969703674, - "lr": 9.608256646216769e-06, - "objective/entropy": -150.9628448486328, - "objective/kl": 32.946922302246094, - "objective/non_score_reward": -1.6473462581634521, - "objective/rlhf_reward": -5.165553171833125, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 23.228769302368164, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.537109375, - "step": 613, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001022815704346 - }, - { - "episode": 9840, - "epoch": 0.058956753064672685, - "loss/policy_avg": 0.8288295269012451, - "lr": 9.607617586912066e-06, - "objective/entropy": -145.37136840820312, - "objective/kl": 37.17048645019531, - "objective/non_score_reward": -1.8585245609283447, - "objective/rlhf_reward": -5.6092691376534205, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 8.95422077178955, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.568359375, - "step": 614, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995349645614624 - }, - { - "episode": 9856, - "epoch": 0.05905261770380223, - "loss/policy_avg": 0.19199243187904358, - "lr": 9.606978527607363e-06, - "objective/entropy": -158.26043701171875, - "objective/kl": 31.016521453857422, - "objective/non_score_reward": -1.550826072692871, - "objective/rlhf_reward": -4.8440544244989585, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.4004452228546143, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5546875, - "step": 615, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00236439704895 - }, - { - "episode": 9872, - "epoch": 0.05914848234293178, - "loss/policy_avg": 0.29752206802368164, - "lr": 9.60633946830266e-06, - "objective/entropy": -141.43800354003906, - "objective/kl": 27.8808536529541, - "objective/non_score_reward": -1.394042730331421, - "objective/rlhf_reward": -3.842837558190028, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 10.629474639892578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5078125, - "step": 616, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00028657913208 - }, - { - "episode": 9888, - "epoch": 0.05924434698206133, - "loss/policy_avg": 0.2227097749710083, - "lr": 9.605700408997955e-06, - "objective/entropy": -97.0810775756836, - "objective/kl": 34.3601188659668, - "objective/non_score_reward": -1.718005895614624, - "objective/rlhf_reward": -5.4481916024285235, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 16.432331085205078, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.548828125, - "step": 617, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9975783824920654 - }, - { - "episode": 9904, - "epoch": 0.05934021162119088, - "loss/policy_avg": 0.17975842952728271, - "lr": 9.605061349693252e-06, - "objective/entropy": -200.100830078125, - "objective/kl": 28.51620864868164, - "objective/non_score_reward": -1.4258103370666504, - "objective/rlhf_reward": -3.8784127190438022, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 10.591612815856934, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 618, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0000221729278564 - }, - { - "episode": 9920, - "epoch": 0.05943607626032043, - "loss/policy_avg": 0.4452857971191406, - "lr": 9.604422290388548e-06, - "objective/entropy": -87.9361572265625, - "objective/kl": 34.174217224121094, - "objective/non_score_reward": -1.7087109088897705, - "objective/rlhf_reward": -5.278584449496821, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 24.203800201416016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.587890625, - "step": 619, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9989922046661377 - }, - { - "episode": 9936, - "epoch": 0.05953194089944998, - "loss/policy_avg": 0.31785786151885986, - "lr": 9.603783231083845e-06, - "objective/entropy": -56.93491744995117, - "objective/kl": 34.28547286987305, - "objective/non_score_reward": -1.7142736911773682, - "objective/rlhf_reward": -5.032265897068094, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 12.636474609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.85546875, - "step": 620, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.996368408203125 - }, - { - "episode": 9952, - "epoch": 0.05962780553857953, - "loss/policy_avg": 0.6350647211074829, - "lr": 9.603144171779142e-06, - "objective/entropy": -129.3587188720703, - "objective/kl": 41.710655212402344, - "objective/non_score_reward": -2.0855326652526855, - "objective/rlhf_reward": -6.219424667135749, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 11.748146057128906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4775390625, - "step": 621, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9979774951934814 - }, - { - "episode": 9968, - "epoch": 0.059723670177709076, - "loss/policy_avg": 0.9843254089355469, - "lr": 9.602505112474439e-06, - "objective/entropy": -95.34288024902344, - "objective/kl": 49.37370300292969, - "objective/non_score_reward": -2.4686851501464844, - "objective/rlhf_reward": -8.049912209781716, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 31.02006721496582, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4970703125, - "step": 622, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9965976476669312 - }, - { - "episode": 9984, - "epoch": 0.059819534816838625, - "loss/policy_avg": 0.6165390610694885, - "lr": 9.601866053169734e-06, - "objective/entropy": -100.56966400146484, - "objective/kl": 33.22990036010742, - "objective/non_score_reward": -1.6614950895309448, - "objective/rlhf_reward": -5.286730491851253, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 13.85442066192627, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.61328125, - "step": 623, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9971177577972412 - }, - { - "episode": 10000, - "epoch": 0.059915399455968174, - "loss/policy_avg": 0.3318287134170532, - "lr": 9.601226993865031e-06, - "objective/entropy": -212.1555938720703, - "objective/kl": 25.822668075561523, - "objective/non_score_reward": -1.2911334037780762, - "objective/rlhf_reward": -2.2408145412218303, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 1.2788864374160767, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.525390625, - "step": 624, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9995461702346802 - }, - { - "episode": 10016, - "epoch": 0.06001126409509772, - "loss/policy_avg": 0.35671815276145935, - "lr": 9.600587934560328e-06, - "objective/entropy": -96.60403442382812, - "objective/kl": 42.28247833251953, - "objective/non_score_reward": -2.114124059677124, - "objective/rlhf_reward": -6.6316679671135645, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 9.525958061218262, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.55859375, - "step": 625, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999858021736145 - }, - { - "episode": 10032, - "epoch": 0.06010712873422727, - "loss/policy_avg": 0.026430530473589897, - "lr": 9.599948875255625e-06, - "objective/entropy": -96.45112609863281, - "objective/kl": 30.055763244628906, - "objective/non_score_reward": -1.5027881860733032, - "objective/rlhf_reward": -4.56055448493515, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 6.234503746032715, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.521484375, - "step": 626, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.002078056335449 - }, - { - "episode": 10048, - "epoch": 0.06020299337335682, - "loss/policy_avg": -0.07770150899887085, - "lr": 9.599309815950922e-06, - "objective/entropy": -78.50785827636719, - "objective/kl": 33.19765090942383, - "objective/non_score_reward": -1.6598827838897705, - "objective/rlhf_reward": -5.158578279431223, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 60.745849609375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5546875, - "step": 627, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0026586055755615 - }, - { - "episode": 10064, - "epoch": 0.06029885801248637, - "loss/policy_avg": 0.045525066554546356, - "lr": 9.598670756646217e-06, - "objective/entropy": -207.98727416992188, - "objective/kl": 34.44676208496094, - "objective/non_score_reward": -1.7223379611968994, - "objective/rlhf_reward": -5.489351963996887, - "objective/scores": 0.35, - "policy/approxkl_avg": 2.952592372894287, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.671875, - "step": 628, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9989871978759766 - }, - { - "episode": 10080, - "epoch": 0.06039472265161592, - "loss/policy_avg": 0.32521092891693115, - "lr": 9.598031697341514e-06, - "objective/entropy": -71.00718688964844, - "objective/kl": 27.00582504272461, - "objective/non_score_reward": -1.3502912521362305, - "objective/rlhf_reward": -3.977332849701015, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 5.865281105041504, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7578125, - "step": 629, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001133918762207 - }, - { - "episode": 10096, - "epoch": 0.06049058729074547, - "loss/policy_avg": 0.22257700562477112, - "lr": 9.59739263803681e-06, - "objective/entropy": -87.40052795410156, - "objective/kl": 31.356922149658203, - "objective/non_score_reward": -1.5678460597991943, - "objective/rlhf_reward": -4.32397324867719, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 27.549453735351562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.546875, - "step": 630, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999577522277832 - }, - { - "episode": 10112, - "epoch": 0.06058645192987502, - "loss/policy_avg": 0.4591647982597351, - "lr": 9.596753578732108e-06, - "objective/entropy": -35.01010513305664, - "objective/kl": 28.93059539794922, - "objective/non_score_reward": -1.4465298652648926, - "objective/rlhf_reward": -4.42686941597311, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 10.006196975708008, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8203125, - "step": 631, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9968595504760742 - }, - { - "episode": 10128, - "epoch": 0.060682316569004566, - "loss/policy_avg": 0.9483177661895752, - "lr": 9.596114519427405e-06, - "objective/entropy": -152.91030883789062, - "objective/kl": 30.360069274902344, - "objective/non_score_reward": -1.5180034637451172, - "objective/rlhf_reward": -4.338680283228555, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 15.410400390625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.599609375, - "step": 632, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9943327903747559 - }, - { - "episode": 10144, - "epoch": 0.060778181208134115, - "loss/policy_avg": 0.4167541265487671, - "lr": 9.595475460122701e-06, - "objective/entropy": -154.04684448242188, - "objective/kl": 33.39550018310547, - "objective/non_score_reward": -1.6697750091552734, - "objective/rlhf_reward": -5.074980471197682, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 53.406578063964844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.576171875, - "step": 633, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9963878393173218 - }, - { - "episode": 10160, - "epoch": 0.060874045847263664, - "loss/policy_avg": -0.021846026182174683, - "lr": 9.594836400817997e-06, - "objective/entropy": -22.81509780883789, - "objective/kl": 23.709880828857422, - "objective/non_score_reward": -1.1854941844940186, - "objective/rlhf_reward": -2.917147810730051, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 0.839837908744812, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6953125, - "step": 634, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000304937362671 - }, - { - "episode": 10176, - "epoch": 0.06096991048639321, - "loss/policy_avg": 0.014755940064787865, - "lr": 9.594197341513293e-06, - "objective/entropy": -198.07839965820312, - "objective/kl": 21.79191017150879, - "objective/non_score_reward": -1.0895954370498657, - "objective/rlhf_reward": -1.9583818078041078, - "objective/scores": 0.6, - "policy/approxkl_avg": 0.6484163999557495, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.767578125, - "step": 635, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0002074241638184 - }, - { - "episode": 10192, - "epoch": 0.06106577512552276, - "loss/policy_avg": 0.13533297181129456, - "lr": 9.593558282208589e-06, - "objective/entropy": -201.26246643066406, - "objective/kl": 26.135250091552734, - "objective/non_score_reward": -1.3067626953125, - "objective/rlhf_reward": -3.885414889364868, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 11.92165756225586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.740234375, - "step": 636, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9993889331817627 - }, - { - "episode": 10208, - "epoch": 0.06116163976465231, - "loss/policy_avg": 0.4021642506122589, - "lr": 9.592919222903886e-06, - "objective/entropy": -286.0339050292969, - "objective/kl": 14.542181968688965, - "objective/non_score_reward": -0.7271090745925903, - "objective/rlhf_reward": -1.484604258735744, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 5.031335353851318, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.701171875, - "step": 637, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.003591775894165 - }, - { - "episode": 10224, - "epoch": 0.06125750440378186, - "loss/policy_avg": 0.2514651417732239, - "lr": 9.592280163599182e-06, - "objective/entropy": -132.75355529785156, - "objective/kl": 25.25128173828125, - "objective/non_score_reward": -1.2625641822814941, - "objective/rlhf_reward": -3.5996581717446894, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 14.74315071105957, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.712890625, - "step": 638, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000126361846924 - }, - { - "episode": 10240, - "epoch": 0.06135336904291141, - "loss/policy_avg": 0.012995198369026184, - "lr": 9.59164110429448e-06, - "objective/entropy": -181.2290496826172, - "objective/kl": 22.253154754638672, - "objective/non_score_reward": -1.1126577854156494, - "objective/rlhf_reward": -3.026798923214046, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 0.9591898918151855, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53515625, - "step": 639, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993176460266113 - }, - { - "episode": 10256, - "epoch": 0.06144923368204096, - "loss/policy_avg": 0.15271592140197754, - "lr": 9.591002044989776e-06, - "objective/entropy": -105.57412719726562, - "objective/kl": 38.59171676635742, - "objective/non_score_reward": -1.9295859336853027, - "objective/rlhf_reward": -6.16208431026037, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 6.626259803771973, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.734375, - "step": 640, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996435284614563 - }, - { - "episode": 10272, - "epoch": 0.061545098321170506, - "loss/policy_avg": -0.11524446308612823, - "lr": 9.590362985685071e-06, - "objective/entropy": -123.53447723388672, - "objective/kl": 26.7266845703125, - "objective/non_score_reward": -1.336334228515625, - "objective/rlhf_reward": -3.222630920187507, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.8472533226013184, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46484375, - "step": 641, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.011606216430664 - }, - { - "episode": 10288, - "epoch": 0.061640962960300055, - "loss/policy_avg": 0.4013972282409668, - "lr": 9.589723926380368e-06, - "objective/entropy": -128.90103149414062, - "objective/kl": 31.007064819335938, - "objective/non_score_reward": -1.5503532886505127, - "objective/rlhf_reward": -4.685641431602177, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 6.671117782592773, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.556640625, - "step": 642, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9970167875289917 - }, - { - "episode": 10304, - "epoch": 0.061736827599429604, - "loss/policy_avg": 0.7907944321632385, - "lr": 9.589084867075665e-06, - "objective/entropy": -58.220497131347656, - "objective/kl": 41.770606994628906, - "objective/non_score_reward": -2.0885305404663086, - "objective/rlhf_reward": -6.620788232485452, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 17.74094581604004, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.466796875, - "step": 643, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.995949149131775 - }, - { - "episode": 10320, - "epoch": 0.06183269223855915, - "loss/policy_avg": 0.017528323456645012, - "lr": 9.588445807770962e-06, - "objective/entropy": -208.79119873046875, - "objective/kl": 23.041034698486328, - "objective/non_score_reward": -1.1520518064498901, - "objective/rlhf_reward": -3.092435383590397, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 1.83624267578125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.732421875, - "step": 644, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0010547637939453 - }, - { - "episode": 10336, - "epoch": 0.0619285568776887, - "loss/policy_avg": 0.15500307083129883, - "lr": 9.587806748466259e-06, - "objective/entropy": -124.78570556640625, - "objective/kl": 34.243202209472656, - "objective/non_score_reward": -1.7121602296829224, - "objective/rlhf_reward": -3.92492190444586, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 2.4558181762695312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5078125, - "step": 645, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997308254241943 - }, - { - "episode": 10352, - "epoch": 0.06202442151681825, - "loss/policy_avg": 0.2161247283220291, - "lr": 9.587167689161556e-06, - "objective/entropy": -163.63064575195312, - "objective/kl": 25.873336791992188, - "objective/non_score_reward": -1.293666958808899, - "objective/rlhf_reward": -3.7960657263673365, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 8.89102840423584, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5234375, - "step": 646, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998837947845459 - }, - { - "episode": 10368, - "epoch": 0.0621202861559478, - "loss/policy_avg": 0.08966261148452759, - "lr": 9.586528629856851e-06, - "objective/entropy": -104.2444076538086, - "objective/kl": 33.29509735107422, - "objective/non_score_reward": -1.664754867553711, - "objective/rlhf_reward": -4.925686256090799, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.3677499294281006, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 647, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993959665298462 - }, - { - "episode": 10384, - "epoch": 0.06221615079507735, - "loss/policy_avg": -0.02724701538681984, - "lr": 9.585889570552148e-06, - "objective/entropy": -133.99429321289062, - "objective/kl": 27.543067932128906, - "objective/non_score_reward": -1.3771533966064453, - "objective/rlhf_reward": -3.1086136460304257, - "objective/scores": 0.6, - "policy/approxkl_avg": 7.215035438537598, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.677734375, - "step": 648, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986003637313843 - }, - { - "episode": 10400, - "epoch": 0.0623120154342069, - "loss/policy_avg": -0.23539991676807404, - "lr": 9.585250511247445e-06, - "objective/entropy": -167.906494140625, - "objective/kl": 25.879772186279297, - "objective/non_score_reward": -1.293988585472107, - "objective/rlhf_reward": -3.571834478441792, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.0954341888427734, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.57421875, - "step": 649, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9997683763504028 - }, - { - "episode": 10416, - "epoch": 0.06240788007333645, - "loss/policy_avg": 0.30569222569465637, - "lr": 9.584611451942742e-06, - "objective/entropy": -226.60678100585938, - "objective/kl": 28.675113677978516, - "objective/non_score_reward": -1.433755874633789, - "objective/rlhf_reward": -3.7876121503877, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 52.77922058105469, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6484375, - "step": 650, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9980084896087646 - }, - { - "episode": 10432, - "epoch": 0.062503744712466, - "loss/policy_avg": -0.24214023351669312, - "lr": 9.583972392638038e-06, - "objective/entropy": -121.17498779296875, - "objective/kl": 38.84062957763672, - "objective/non_score_reward": -1.9420316219329834, - "objective/rlhf_reward": -5.820715139584477, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 1.8967432975769043, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.623046875, - "step": 651, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0004329681396484 - }, - { - "episode": 10448, - "epoch": 0.06259960935159554, - "loss/policy_avg": -0.3156575858592987, - "lr": 9.583333333333335e-06, - "objective/entropy": -146.38143920898438, - "objective/kl": 32.020687103271484, - "objective/non_score_reward": -1.60103440284729, - "objective/rlhf_reward": -5.062502017527251, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 2.199296236038208, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.642578125, - "step": 652, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0025553703308105 - }, - { - "episode": 10464, - "epoch": 0.0626954739907251, - "loss/policy_avg": 0.07271748781204224, - "lr": 9.58269427402863e-06, - "objective/entropy": -196.48562622070312, - "objective/kl": 28.001068115234375, - "objective/non_score_reward": -1.4000535011291504, - "objective/rlhf_reward": -4.2002141833305355, - "objective/scores": 0.35, - "policy/approxkl_avg": 24.475753784179688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6875, - "step": 653, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0010745525360107 - }, - { - "episode": 10480, - "epoch": 0.06279133862985464, - "loss/policy_avg": 0.17373695969581604, - "lr": 9.582055214723927e-06, - "objective/entropy": -275.5335388183594, - "objective/kl": 27.79926300048828, - "objective/non_score_reward": -1.3899632692337036, - "objective/rlhf_reward": -5.5598530769348145, - "objective/scores": 0.0, - "policy/approxkl_avg": 17.22200584411621, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.615234375, - "step": 654, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9987027645111084 - }, - { - "episode": 10496, - "epoch": 0.0628872032689842, - "loss/policy_avg": 0.15186084806919098, - "lr": 9.581416155419224e-06, - "objective/entropy": -197.2568817138672, - "objective/kl": 23.105377197265625, - "objective/non_score_reward": -1.1552690267562866, - "objective/rlhf_reward": -2.796247239383768, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 35.64599609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7734375, - "step": 655, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985601902008057 - }, - { - "episode": 10512, - "epoch": 0.06298306790811374, - "loss/policy_avg": 0.09821736067533493, - "lr": 9.58077709611452e-06, - "objective/entropy": -192.20767211914062, - "objective/kl": 28.659635543823242, - "objective/non_score_reward": -1.4329817295074463, - "objective/rlhf_reward": -4.070067649305449, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 5.6847333908081055, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.712890625, - "step": 656, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977080821990967 - }, - { - "episode": 10528, - "epoch": 0.0630789325472433, - "loss/policy_avg": 0.24115119874477386, - "lr": 9.580138036809816e-06, - "objective/entropy": -171.08619689941406, - "objective/kl": 26.453920364379883, - "objective/non_score_reward": -1.3226962089538574, - "objective/rlhf_reward": -3.8907844781875607, - "objective/scores": 0.35, - "policy/approxkl_avg": 11.276920318603516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6875, - "step": 657, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999925136566162 - }, - { - "episode": 10544, - "epoch": 0.06317479718637284, - "loss/policy_avg": -0.04878993332386017, - "lr": 9.579498977505113e-06, - "objective/entropy": -95.69158172607422, - "objective/kl": 26.445575714111328, - "objective/non_score_reward": -1.3222787380218506, - "objective/rlhf_reward": -3.94747917941156, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 5.285589218139648, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.625, - "step": 658, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0013298988342285 - }, - { - "episode": 10560, - "epoch": 0.0632706618255024, - "loss/policy_avg": -0.10105658322572708, - "lr": 9.57885991820041e-06, - "objective/entropy": -209.01065063476562, - "objective/kl": 27.234224319458008, - "objective/non_score_reward": -1.3617112636566162, - "objective/rlhf_reward": -4.046844816207885, - "objective/scores": 0.35, - "policy/approxkl_avg": 2.436962366104126, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6328125, - "step": 659, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001246690750122 - }, - { - "episode": 10576, - "epoch": 0.06336652646463194, - "loss/policy_avg": -0.3218346834182739, - "lr": 9.578220858895705e-06, - "objective/entropy": -3.9748001098632812, - "objective/kl": 18.186880111694336, - "objective/non_score_reward": -0.9093440771102905, - "objective/rlhf_reward": -1.5146698824324945, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 28.07345962524414, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.8359375, - "step": 660, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0002238750457764 - }, - { - "episode": 10592, - "epoch": 0.06346239110376149, - "loss/policy_avg": -0.19762462377548218, - "lr": 9.577581799591002e-06, - "objective/entropy": -204.72760009765625, - "objective/kl": 18.785112380981445, - "objective/non_score_reward": -0.9392555356025696, - "objective/rlhf_reward": -1.6343160293259955, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.8940598964691162, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.560546875, - "step": 661, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0025925636291504 - }, - { - "episode": 10608, - "epoch": 0.06355825574289103, - "loss/policy_avg": -0.45743584632873535, - "lr": 9.576942740286299e-06, - "objective/entropy": -134.4844970703125, - "objective/kl": 33.7373046875, - "objective/non_score_reward": -1.6868653297424316, - "objective/rlhf_reward": -5.296863298030242, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 2.153486967086792, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.521484375, - "step": 662, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.00048828125 - }, - { - "episode": 10624, - "epoch": 0.06365412038202059, - "loss/policy_avg": 0.2565079629421234, - "lr": 9.576303680981596e-06, - "objective/entropy": -180.13528442382812, - "objective/kl": 17.24534034729004, - "objective/non_score_reward": -0.862267017364502, - "objective/rlhf_reward": -2.089818143580837, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 7.433453559875488, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.626953125, - "step": 663, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9994475841522217 - }, - { - "episode": 10640, - "epoch": 0.06374998502115013, - "loss/policy_avg": 0.17452527582645416, - "lr": 9.575664621676893e-06, - "objective/entropy": -64.2728271484375, - "objective/kl": 21.405649185180664, - "objective/non_score_reward": -1.0702824592590332, - "objective/rlhf_reward": -2.9218800303682517, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.6351606845855713, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.71484375, - "step": 664, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0012965202331543 - }, - { - "episode": 10656, - "epoch": 0.06384584966027969, - "loss/policy_avg": 0.6966801881790161, - "lr": 9.57502556237219e-06, - "objective/entropy": -251.04238891601562, - "objective/kl": 27.693851470947266, - "objective/non_score_reward": -1.384692668914795, - "objective/rlhf_reward": -3.934650454584675, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 10.390886306762695, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.5859375, - "step": 665, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001401662826538 - }, - { - "episode": 10672, - "epoch": 0.06394171429940923, - "loss/policy_avg": 0.16458481550216675, - "lr": 9.574386503067485e-06, - "objective/entropy": -219.99136352539062, - "objective/kl": 13.308931350708008, - "objective/non_score_reward": -0.6654465198516846, - "objective/rlhf_reward": -0.7143749100732166, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 3.77976131439209, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69140625, - "step": 666, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000656843185425 - }, - { - "episode": 10688, - "epoch": 0.06403757893853879, - "loss/policy_avg": -0.009436726570129395, - "lr": 9.573747443762782e-06, - "objective/entropy": -162.25047302246094, - "objective/kl": 23.977962493896484, - "objective/non_score_reward": -1.1988980770111084, - "objective/rlhf_reward": -2.8481810791062667, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 22.450942993164062, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.763671875, - "step": 667, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0019123554229736 - }, - { - "episode": 10704, - "epoch": 0.06413344357766833, - "loss/policy_avg": 0.4135128855705261, - "lr": 9.573108384458079e-06, - "objective/entropy": -63.0797119140625, - "objective/kl": 41.37904739379883, - "objective/non_score_reward": -2.0689523220062256, - "objective/rlhf_reward": -6.542475895086923, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 88.98745727539062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.779296875, - "step": 668, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9999923706054688 - }, - { - "episode": 10720, - "epoch": 0.06422930821679788, - "loss/policy_avg": 0.6821532845497131, - "lr": 9.572469325153375e-06, - "objective/entropy": -196.7287139892578, - "objective/kl": 30.88260269165039, - "objective/non_score_reward": -1.5441300868988037, - "objective/rlhf_reward": -4.660748505386051, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 23.963293075561523, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.78515625, - "step": 669, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9989004135131836 - }, - { - "episode": 10736, - "epoch": 0.06432517285592743, - "loss/policy_avg": 0.3629915118217468, - "lr": 9.571830265848672e-06, - "objective/entropy": -205.541259765625, - "objective/kl": 24.442432403564453, - "objective/non_score_reward": -1.2221217155456543, - "objective/rlhf_reward": -3.155153171221415, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 15.010305404663086, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.63671875, - "step": 670, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991261959075928 - }, - { - "episode": 10752, - "epoch": 0.06442103749505698, - "loss/policy_avg": 0.3024546504020691, - "lr": 9.571191206543968e-06, - "objective/entropy": -184.0182647705078, - "objective/kl": 28.46197509765625, - "objective/non_score_reward": -1.4230988025665283, - "objective/rlhf_reward": -3.744983862118657, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.1509013175964355, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.611328125, - "step": 671, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998319149017334 - }, - { - "episode": 10768, - "epoch": 0.06451690213418652, - "loss/policy_avg": -0.12359270453453064, - "lr": 9.570552147239264e-06, - "objective/entropy": -107.1251220703125, - "objective/kl": 24.85216522216797, - "objective/non_score_reward": -1.2426085472106934, - "objective/rlhf_reward": -3.611183964942379, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.815180540084839, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.62890625, - "step": 672, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.002232074737549 - }, - { - "episode": 10784, - "epoch": 0.06461276677331608, - "loss/policy_avg": 0.3783743977546692, - "lr": 9.569913087934561e-06, - "objective/entropy": -155.0634765625, - "objective/kl": 33.26643371582031, - "objective/non_score_reward": -1.663321852684021, - "objective/rlhf_reward": -5.294037544463558, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 7.487679958343506, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.751953125, - "step": 673, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9973864555358887 - }, - { - "episode": 10800, - "epoch": 0.06470863141244562, - "loss/policy_avg": 0.12491178512573242, - "lr": 9.569274028629858e-06, - "objective/entropy": -202.8880157470703, - "objective/kl": 23.53227996826172, - "objective/non_score_reward": -1.1766140460968018, - "objective/rlhf_reward": -2.9731229106585184, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 5.709697246551514, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.576171875, - "step": 674, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979543685913086 - }, - { - "episode": 10816, - "epoch": 0.06480449605157518, - "loss/policy_avg": -0.01751142367720604, - "lr": 9.568634969325155e-06, - "objective/entropy": -217.27896118164062, - "objective/kl": 27.020957946777344, - "objective/non_score_reward": -1.3510478734970093, - "objective/rlhf_reward": -3.4567805034684493, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 0.6378078460693359, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.63671875, - "step": 675, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0012476444244385 - }, - { - "episode": 10832, - "epoch": 0.06490036069070472, - "loss/policy_avg": 0.28126630187034607, - "lr": 9.567995910020452e-06, - "objective/entropy": -230.15963745117188, - "objective/kl": 24.95879364013672, - "objective/non_score_reward": -1.2479398250579834, - "objective/rlhf_reward": -3.329899912298308, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 10.301782608032227, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.744140625, - "step": 676, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9993771314620972 - }, - { - "episode": 10848, - "epoch": 0.06499622532983428, - "loss/policy_avg": 0.12287623435258865, - "lr": 9.567356850715747e-06, - "objective/entropy": -263.37542724609375, - "objective/kl": 23.937744140625, - "objective/non_score_reward": -1.1968872547149658, - "objective/rlhf_reward": -0.3875493764877316, - "objective/scores": 1.1, - "policy/approxkl_avg": 45.05952453613281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.703125, - "step": 677, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9995876550674438 - }, - { - "episode": 10864, - "epoch": 0.06509208996896382, - "loss/policy_avg": 0.6470179557800293, - "lr": 9.566717791411044e-06, - "objective/entropy": -65.45881652832031, - "objective/kl": 23.807559967041016, - "objective/non_score_reward": -1.190378189086914, - "objective/rlhf_reward": -3.419877028375297, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 10.65350341796875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.744140625, - "step": 678, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999696969985962 - }, - { - "episode": 10880, - "epoch": 0.06518795460809337, - "loss/policy_avg": 0.2790781855583191, - "lr": 9.56607873210634e-06, - "objective/entropy": -161.4605712890625, - "objective/kl": 41.620460510253906, - "objective/non_score_reward": -2.0810232162475586, - "objective/rlhf_reward": -3.924092388153076, - "objective/scores": 1.1, - "policy/approxkl_avg": 5.482306480407715, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.57421875, - "step": 679, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986392259597778 - }, - { - "episode": 10896, - "epoch": 0.06528381924722292, - "loss/policy_avg": 0.042992569506168365, - "lr": 9.565439672801636e-06, - "objective/entropy": -162.92010498046875, - "objective/kl": 26.902143478393555, - "objective/non_score_reward": -1.3451071977615356, - "objective/rlhf_reward": -4.001826503363949, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 4.27599573135376, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.654296875, - "step": 680, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998090147972107 - }, - { - "episode": 10912, - "epoch": 0.06537968388635247, - "loss/policy_avg": 0.20157073438167572, - "lr": 9.564800613496933e-06, - "objective/entropy": -265.3901672363281, - "objective/kl": 29.956632614135742, - "objective/non_score_reward": -1.4978315830230713, - "objective/rlhf_reward": -3.868620397821937, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 68.22042846679688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.609375, - "step": 681, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982101917266846 - }, - { - "episode": 10928, - "epoch": 0.06547554852548201, - "loss/policy_avg": 1.519484281539917, - "lr": 9.56416155419223e-06, - "objective/entropy": -127.62720489501953, - "objective/kl": 23.382505416870117, - "objective/non_score_reward": -1.1691253185272217, - "objective/rlhf_reward": -2.2765009164810177, - "objective/scores": 0.6, - "policy/approxkl_avg": 17.878856658935547, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.419921875, - "step": 682, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9984099864959717 - }, - { - "episode": 10944, - "epoch": 0.06557141316461157, - "loss/policy_avg": 0.3158057928085327, - "lr": 9.563522494887527e-06, - "objective/entropy": -190.45260620117188, - "objective/kl": 25.518230438232422, - "objective/non_score_reward": -1.275911569595337, - "objective/rlhf_reward": -3.622693660672068, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 34.12330627441406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.63671875, - "step": 683, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000800371170044 - }, - { - "episode": 10960, - "epoch": 0.06566727780374111, - "loss/policy_avg": 1.1294161081314087, - "lr": 9.562883435582822e-06, - "objective/entropy": -107.20721435546875, - "objective/kl": 32.379913330078125, - "objective/non_score_reward": -1.6189957857131958, - "objective/rlhf_reward": -5.13434737017694, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 7.272080421447754, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5234375, - "step": 684, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998885154724121 - }, - { - "episode": 10976, - "epoch": 0.06576314244287067, - "loss/policy_avg": 0.44281357526779175, - "lr": 9.562244376278119e-06, - "objective/entropy": -128.0640869140625, - "objective/kl": 20.03044891357422, - "objective/non_score_reward": -1.0015225410461426, - "objective/rlhf_reward": -1.082371120096418, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 12.73418140411377, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.634765625, - "step": 685, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999821424484253 - }, - { - "episode": 10992, - "epoch": 0.06585900708200021, - "loss/policy_avg": 0.2683737576007843, - "lr": 9.561605316973416e-06, - "objective/entropy": -258.8201904296875, - "objective/kl": 27.295347213745117, - "objective/non_score_reward": -1.3647674322128296, - "objective/rlhf_reward": -2.535350595356199, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 5.86362886428833, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71875, - "step": 686, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9981340169906616 - }, - { - "episode": 11008, - "epoch": 0.06595487172112977, - "loss/policy_avg": -0.14624132215976715, - "lr": 9.560966257668713e-06, - "objective/entropy": -96.99462890625, - "objective/kl": 30.466350555419922, - "objective/non_score_reward": -1.523317575454712, - "objective/rlhf_reward": -4.57749816158646, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 8.779112815856934, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.494140625, - "step": 687, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981212615966797 - }, - { - "episode": 11024, - "epoch": 0.06605073636025931, - "loss/policy_avg": 0.12842759490013123, - "lr": 9.56032719836401e-06, - "objective/entropy": -166.20689392089844, - "objective/kl": 26.250516891479492, - "objective/non_score_reward": -1.312525749206543, - "objective/rlhf_reward": -2.8501029968261715, - "objective/scores": 0.6, - "policy/approxkl_avg": 7.160890102386475, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5703125, - "step": 688, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990025758743286 - }, - { - "episode": 11040, - "epoch": 0.06614660099938886, - "loss/policy_avg": 0.2923339009284973, - "lr": 9.559688139059306e-06, - "objective/entropy": -236.72100830078125, - "objective/kl": 33.81795883178711, - "objective/non_score_reward": -1.6908979415893555, - "objective/rlhf_reward": -5.4219562321001575, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 16.3193359375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.650390625, - "step": 689, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.99735426902771 - }, - { - "episode": 11056, - "epoch": 0.0662424656385184, - "loss/policy_avg": -0.10266150534152985, - "lr": 9.559049079754601e-06, - "objective/entropy": -85.62126159667969, - "objective/kl": 31.331233978271484, - "objective/non_score_reward": -1.5665616989135742, - "objective/rlhf_reward": -4.143540324942146, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 6.518294811248779, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.794921875, - "step": 690, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.006378650665283 - }, - { - "episode": 11072, - "epoch": 0.06633833027764796, - "loss/policy_avg": 0.17208513617515564, - "lr": 9.558410020449898e-06, - "objective/entropy": -175.00662231445312, - "objective/kl": 33.992698669433594, - "objective/non_score_reward": -1.6996350288391113, - "objective/rlhf_reward": -5.4392902490839194, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 32.03794860839844, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.716796875, - "step": 691, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989235401153564 - }, - { - "episode": 11088, - "epoch": 0.06643419491677752, - "loss/policy_avg": 0.01335047371685505, - "lr": 9.557770961145195e-06, - "objective/entropy": -248.65049743652344, - "objective/kl": 22.41885757446289, - "objective/non_score_reward": -1.1209429502487183, - "objective/rlhf_reward": -2.536360512452062, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.7352328300476074, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.673828125, - "step": 692, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0011818408966064 - }, - { - "episode": 11104, - "epoch": 0.06653005955590706, - "loss/policy_avg": 0.14417897164821625, - "lr": 9.557131901840492e-06, - "objective/entropy": -218.454345703125, - "objective/kl": 15.86509895324707, - "objective/non_score_reward": -0.7932549715042114, - "objective/rlhf_reward": 1.226980143785477, - "objective/scores": 1.1, - "policy/approxkl_avg": 1.0328912734985352, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.640625, - "step": 693, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0003151893615723 - }, - { - "episode": 11120, - "epoch": 0.06662592419503661, - "loss/policy_avg": 0.09597369283437729, - "lr": 9.556492842535789e-06, - "objective/entropy": -175.68487548828125, - "objective/kl": 32.48929977416992, - "objective/non_score_reward": -1.624464988708496, - "objective/rlhf_reward": -2.0978601336479183, - "objective/scores": 1.1, - "policy/approxkl_avg": 3.689056396484375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.705078125, - "step": 694, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998678207397461 - }, - { - "episode": 11136, - "epoch": 0.06672178883416616, - "loss/policy_avg": -0.004386359825730324, - "lr": 9.555853783231084e-06, - "objective/entropy": 122.54474639892578, - "objective/kl": 42.134315490722656, - "objective/non_score_reward": -2.106715679168701, - "objective/rlhf_reward": -6.822743091646748, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 6.307683944702148, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7421875, - "step": 695, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999419450759888 - }, - { - "episode": 11152, - "epoch": 0.06681765347329571, - "loss/policy_avg": 0.3615373373031616, - "lr": 9.555214723926381e-06, - "objective/entropy": -260.84075927734375, - "objective/kl": 35.725467681884766, - "objective/non_score_reward": -1.7862732410430908, - "objective/rlhf_reward": -5.664140108044505, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 45.438873291015625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.685546875, - "step": 696, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.996116280555725 - }, - { - "episode": 11168, - "epoch": 0.06691351811242525, - "loss/policy_avg": 0.24602335691452026, - "lr": 9.554575664621678e-06, - "objective/entropy": -71.92741394042969, - "objective/kl": 30.083784103393555, - "objective/non_score_reward": -1.5041892528533936, - "objective/rlhf_reward": -4.6575073835596275, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 5.438946723937988, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4765625, - "step": 697, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998178482055664 - }, - { - "episode": 11184, - "epoch": 0.06700938275155481, - "loss/policy_avg": 0.034039177000522614, - "lr": 9.553936605316975e-06, - "objective/entropy": -198.67774963378906, - "objective/kl": 23.375925064086914, - "objective/non_score_reward": -1.1687963008880615, - "objective/rlhf_reward": -1.7514660700571265, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 0.5530495643615723, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.53125, - "step": 698, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00034236907959 - }, - { - "episode": 11200, - "epoch": 0.06710524739068435, - "loss/policy_avg": 0.5306535959243774, - "lr": 9.553297546012272e-06, - "objective/entropy": -143.43771362304688, - "objective/kl": 35.411888122558594, - "objective/non_score_reward": -1.7705943584442139, - "objective/rlhf_reward": -5.63177965125595, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 6.416120529174805, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.66796875, - "step": 699, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994237422943115 - }, - { - "episode": 11216, - "epoch": 0.06720111202981391, - "loss/policy_avg": 0.2092888504266739, - "lr": 9.552658486707569e-06, - "objective/entropy": -169.036376953125, - "objective/kl": 30.64543914794922, - "objective/non_score_reward": -1.5322721004486084, - "objective/rlhf_reward": -1.7290880441665646, - "objective/scores": 1.1, - "policy/approxkl_avg": 132.6121063232422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.564453125, - "step": 700, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992992877960205 - }, - { - "episode": 11232, - "epoch": 0.06729697666894345, - "loss/policy_avg": 0.2553282380104065, - "lr": 9.552019427402864e-06, - "objective/entropy": -145.8370361328125, - "objective/kl": 31.58509063720703, - "objective/non_score_reward": -1.5792546272277832, - "objective/rlhf_reward": -4.760759084430292, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 23.342622756958008, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.712890625, - "step": 701, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0007388591766357 - }, - { - "episode": 11248, - "epoch": 0.067392841308073, - "loss/policy_avg": 0.1272473782300949, - "lr": 9.55138036809816e-06, - "objective/entropy": -283.0919494628906, - "objective/kl": 18.825233459472656, - "objective/non_score_reward": -0.9412617683410645, - "objective/rlhf_reward": -2.4057970878824424, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.5947492122650146, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 702, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999638557434082 - }, - { - "episode": 11264, - "epoch": 0.06748870594720255, - "loss/policy_avg": 0.2034430205821991, - "lr": 9.550741308793456e-06, - "objective/entropy": -274.40478515625, - "objective/kl": 20.724695205688477, - "objective/non_score_reward": -1.0362348556518555, - "objective/rlhf_reward": -1.221220110298368, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 3.738941192626953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.61328125, - "step": 703, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997761607170105 - }, - { - "episode": 11280, - "epoch": 0.0675845705863321, - "loss/policy_avg": 0.7114033699035645, - "lr": 9.550102249488753e-06, - "objective/entropy": -135.6627960205078, - "objective/kl": 27.718311309814453, - "objective/non_score_reward": -1.3859155178070068, - "objective/rlhf_reward": -3.5962508422898605, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 32.94233703613281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.64453125, - "step": 704, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985225200653076 - }, - { - "episode": 11296, - "epoch": 0.06768043522546165, - "loss/policy_avg": -0.08856553584337234, - "lr": 9.54946319018405e-06, - "objective/entropy": -172.419921875, - "objective/kl": 31.078826904296875, - "objective/non_score_reward": -1.553941249847412, - "objective/rlhf_reward": -4.765167097659454, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 27.00151824951172, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.76171875, - "step": 705, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.003095865249634 - }, - { - "episode": 11312, - "epoch": 0.0677762998645912, - "loss/policy_avg": -0.1016867533326149, - "lr": 9.548824130879346e-06, - "objective/entropy": -186.52476501464844, - "objective/kl": 30.371601104736328, - "objective/non_score_reward": -1.5185801982879639, - "objective/rlhf_reward": -4.593368175442576, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 7.805020332336426, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.68359375, - "step": 706, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0018138885498047 - }, - { - "episode": 11328, - "epoch": 0.06787216450372074, - "loss/policy_avg": 0.3950710892677307, - "lr": 9.548185071574643e-06, - "objective/entropy": -169.30099487304688, - "objective/kl": 26.604206085205078, - "objective/non_score_reward": -1.3302103281021118, - "objective/rlhf_reward": -3.9422390843308985, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 3.9309802055358887, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.650390625, - "step": 707, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0003857612609863 - }, - { - "episode": 11344, - "epoch": 0.0679680291428503, - "loss/policy_avg": 0.15957045555114746, - "lr": 9.547546012269938e-06, - "objective/entropy": -152.48211669921875, - "objective/kl": 28.93355941772461, - "objective/non_score_reward": -1.4466780424118042, - "objective/rlhf_reward": -4.124852543295012, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 30.355663299560547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.728515625, - "step": 708, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9971683025360107 - }, - { - "episode": 11360, - "epoch": 0.06806389378197984, - "loss/policy_avg": 0.1635814905166626, - "lr": 9.546906952965235e-06, - "objective/entropy": -225.05284118652344, - "objective/kl": 32.07009506225586, - "objective/non_score_reward": -1.6035047769546509, - "objective/rlhf_reward": -5.088506314784212, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 25.63396453857422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.708984375, - "step": 709, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9975805282592773 - }, - { - "episode": 11376, - "epoch": 0.0681597584211094, - "loss/policy_avg": 0.22918304800987244, - "lr": 9.546267893660532e-06, - "objective/entropy": -245.11099243164062, - "objective/kl": 31.21074867248535, - "objective/non_score_reward": -1.560537576675415, - "objective/rlhf_reward": -4.5802904419308765, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 14.6522216796875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.658203125, - "step": 710, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9971773624420166 - }, - { - "episode": 11392, - "epoch": 0.06825562306023894, - "loss/policy_avg": -0.15267148613929749, - "lr": 9.545628834355829e-06, - "objective/entropy": -26.006134033203125, - "objective/kl": 25.76430320739746, - "objective/non_score_reward": -1.288215160369873, - "objective/rlhf_reward": -3.2054496509599044, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.9515511989593506, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.75, - "step": 711, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0021400451660156 - }, - { - "episode": 11408, - "epoch": 0.0683514876993685, - "loss/policy_avg": 0.03201477974653244, - "lr": 9.544989775051126e-06, - "objective/entropy": -229.9574737548828, - "objective/kl": 31.691633224487305, - "objective/non_score_reward": -1.5845816135406494, - "objective/rlhf_reward": -4.887728492827758, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 81.25225830078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.74609375, - "step": 712, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0012998580932617 - }, - { - "episode": 11424, - "epoch": 0.06844735233849804, - "loss/policy_avg": 0.5598920583724976, - "lr": 9.544350715746423e-06, - "objective/entropy": -198.39407348632812, - "objective/kl": 22.02547264099121, - "objective/non_score_reward": -1.1012736558914185, - "objective/rlhf_reward": -3.045844846700115, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 7.494403839111328, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6640625, - "step": 713, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001235008239746 - }, - { - "episode": 11440, - "epoch": 0.0685432169776276, - "loss/policy_avg": 0.14270013570785522, - "lr": 9.543711656441718e-06, - "objective/entropy": -281.67730712890625, - "objective/kl": 30.167518615722656, - "objective/non_score_reward": -1.5083760023117065, - "objective/rlhf_reward": -4.517732465060886, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 42.272212982177734, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6171875, - "step": 714, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9981521368026733 - }, - { - "episode": 11456, - "epoch": 0.06863908161675714, - "loss/policy_avg": 0.23854002356529236, - "lr": 9.543072597137015e-06, - "objective/entropy": -205.70501708984375, - "objective/kl": 26.037616729736328, - "objective/non_score_reward": -1.3018807172775269, - "objective/rlhf_reward": -3.603402886454182, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 21.1671085357666, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.736328125, - "step": 715, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999593734741211 - }, - { - "episode": 11472, - "epoch": 0.06873494625588669, - "loss/policy_avg": 0.25810641050338745, - "lr": 9.542433537832312e-06, - "objective/entropy": -202.4583740234375, - "objective/kl": 26.777297973632812, - "objective/non_score_reward": -1.338865041732788, - "objective/rlhf_reward": -3.7513400054612926, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 4.448478698730469, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7890625, - "step": 716, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999415636062622 - }, - { - "episode": 11488, - "epoch": 0.06883081089501623, - "loss/policy_avg": 0.16866181790828705, - "lr": 9.541794478527609e-06, - "objective/entropy": -174.37855529785156, - "objective/kl": 34.941444396972656, - "objective/non_score_reward": -1.7470722198486328, - "objective/rlhf_reward": -5.43202957412298, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 0.9149700403213501, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69140625, - "step": 717, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.000814437866211 - }, - { - "episode": 11504, - "epoch": 0.06892667553414579, - "loss/policy_avg": 0.20718123018741608, - "lr": 9.541155419222906e-06, - "objective/entropy": -75.93595123291016, - "objective/kl": 37.52787780761719, - "objective/non_score_reward": -1.8763937950134277, - "objective/rlhf_reward": -6.024622860367655, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 3.859286308288574, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5859375, - "step": 718, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990894794464111 - }, - { - "episode": 11520, - "epoch": 0.06902254017327533, - "loss/policy_avg": -0.14078834652900696, - "lr": 9.5405163599182e-06, - "objective/entropy": -111.06301879882812, - "objective/kl": 37.833980560302734, - "objective/non_score_reward": -1.8916990756988525, - "objective/rlhf_reward": -5.44408971287397, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.0138969421386719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.646484375, - "step": 719, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0016212463378906 - }, - { - "episode": 11536, - "epoch": 0.06911840481240489, - "loss/policy_avg": -0.02326921373605728, - "lr": 9.539877300613498e-06, - "objective/entropy": -7.474525451660156, - "objective/kl": 37.21611785888672, - "objective/non_score_reward": -1.860805869102478, - "objective/rlhf_reward": -7.443223357200623, - "objective/scores": 0.0, - "policy/approxkl_avg": 0.989769458770752, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5546875, - "step": 720, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0015361309051514 - }, - { - "episode": 11552, - "epoch": 0.06921426945153443, - "loss/policy_avg": 0.9960123896598816, - "lr": 9.539238241308795e-06, - "objective/entropy": -102.21640014648438, - "objective/kl": 29.624881744384766, - "objective/non_score_reward": -1.4812440872192383, - "objective/rlhf_reward": -3.9775650007294967, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 1.5700416564941406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.568359375, - "step": 721, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000302314758301 - }, - { - "episode": 11568, - "epoch": 0.06931013409066399, - "loss/policy_avg": -0.022494332864880562, - "lr": 9.538599182004091e-06, - "objective/entropy": -97.00556182861328, - "objective/kl": 34.23220443725586, - "objective/non_score_reward": -1.7116100788116455, - "objective/rlhf_reward": -5.520927820235414, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 6.0028605461120605, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.625, - "step": 722, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0014429092407227 - }, - { - "episode": 11584, - "epoch": 0.06940599872979353, - "loss/policy_avg": 0.2970792055130005, - "lr": 9.537960122699387e-06, - "objective/entropy": -218.43130493164062, - "objective/kl": 23.677339553833008, - "objective/non_score_reward": -1.1838669776916504, - "objective/rlhf_reward": -0.335467970371246, - "objective/scores": 1.1, - "policy/approxkl_avg": 35.85502624511719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.744140625, - "step": 723, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9957175254821777 - }, - { - "episode": 11600, - "epoch": 0.06950186336892308, - "loss/policy_avg": 0.09062906354665756, - "lr": 9.537321063394683e-06, - "objective/entropy": -145.62179565429688, - "objective/kl": 19.510597229003906, - "objective/non_score_reward": -0.9755299091339111, - "objective/rlhf_reward": -2.560484102278381, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 5.657525539398193, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.671875, - "step": 724, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.002211809158325 - }, - { - "episode": 11616, - "epoch": 0.06959772800805263, - "loss/policy_avg": 0.5650205612182617, - "lr": 9.53668200408998e-06, - "objective/entropy": -189.58197021484375, - "objective/kl": 22.43151092529297, - "objective/non_score_reward": -1.1215755939483643, - "objective/rlhf_reward": -3.1446669011408384, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 16.189781188964844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.69140625, - "step": 725, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9978678226470947 - }, - { - "episode": 11632, - "epoch": 0.06969359264718218, - "loss/policy_avg": 0.10538655519485474, - "lr": 9.536042944785277e-06, - "objective/entropy": -262.17254638671875, - "objective/kl": 21.21435546875, - "objective/non_score_reward": -1.0607177019119263, - "objective/rlhf_reward": -2.1201648137727123, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 5.554556846618652, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7578125, - "step": 726, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9994568824768066 - }, - { - "episode": 11648, - "epoch": 0.06978945728631172, - "loss/policy_avg": 0.08264347910881042, - "lr": 9.535403885480572e-06, - "objective/entropy": -144.35389709472656, - "objective/kl": 23.849288940429688, - "objective/non_score_reward": -1.1924644708633423, - "objective/rlhf_reward": -3.2889052657440896, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 0.2577582895755768, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.60546875, - "step": 727, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0009419918060303 - }, - { - "episode": 11664, - "epoch": 0.06988532192544128, - "loss/policy_avg": -0.11442309617996216, - "lr": 9.53476482617587e-06, - "objective/entropy": -161.91555786132812, - "objective/kl": 29.32978057861328, - "objective/non_score_reward": -1.4664889574050903, - "objective/rlhf_reward": -4.132622496287028, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 6.162350654602051, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 728, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0256590843200684 - }, - { - "episode": 11680, - "epoch": 0.06998118656457082, - "loss/policy_avg": 0.15979725122451782, - "lr": 9.534125766871166e-06, - "objective/entropy": -46.392860412597656, - "objective/kl": 34.71672058105469, - "objective/non_score_reward": -1.7358360290527344, - "objective/rlhf_reward": -5.601708403139739, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 26.208736419677734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.837890625, - "step": 729, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9947504997253418 - }, - { - "episode": 11696, - "epoch": 0.07007705120370038, - "loss/policy_avg": 0.01945001818239689, - "lr": 9.533486707566463e-06, - "objective/entropy": -199.32308959960938, - "objective/kl": 20.052722930908203, - "objective/non_score_reward": -1.002636194229126, - "objective/rlhf_reward": -2.586712677677242, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 5.049467086791992, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.623046875, - "step": 730, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9999655485153198 - }, - { - "episode": 11712, - "epoch": 0.07017291584282992, - "loss/policy_avg": 0.22911685705184937, - "lr": 9.53284764826176e-06, - "objective/entropy": -199.43820190429688, - "objective/kl": 29.375852584838867, - "objective/non_score_reward": -1.4687926769256592, - "objective/rlhf_reward": -3.4751707077026364, - "objective/scores": 0.6, - "policy/approxkl_avg": 1.4132235050201416, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.64453125, - "step": 731, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989359378814697 - }, - { - "episode": 11728, - "epoch": 0.07026878048195948, - "loss/policy_avg": 0.045667171478271484, - "lr": 9.532208588957055e-06, - "objective/entropy": -156.77005004882812, - "objective/kl": 28.574951171875, - "objective/non_score_reward": -1.4287474155426025, - "objective/rlhf_reward": -4.110870037142353, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 11.299884796142578, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.587890625, - "step": 732, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989423751831055 - }, - { - "episode": 11744, - "epoch": 0.07036464512108902, - "loss/policy_avg": -0.07621235400438309, - "lr": 9.531569529652352e-06, - "objective/entropy": -211.5927734375, - "objective/kl": 25.139881134033203, - "objective/non_score_reward": -1.2569940090179443, - "objective/rlhf_reward": -3.2031475856629124, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 2.0796079635620117, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.548828125, - "step": 733, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00140118598938 - }, - { - "episode": 11760, - "epoch": 0.07046050976021857, - "loss/policy_avg": 0.3665542006492615, - "lr": 9.530930470347649e-06, - "objective/entropy": -136.42066955566406, - "objective/kl": 28.39642333984375, - "objective/non_score_reward": -1.4198211431503296, - "objective/rlhf_reward": -5.679284453392029, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.8006393909454346, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.673828125, - "step": 734, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988253116607666 - }, - { - "episode": 11776, - "epoch": 0.07055637439934812, - "loss/policy_avg": -0.16624964773654938, - "lr": 9.530291411042946e-06, - "objective/entropy": -172.16896057128906, - "objective/kl": 32.62467956542969, - "objective/non_score_reward": -1.6312339305877686, - "objective/rlhf_reward": -5.183300068884521, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 8.176142692565918, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587890625, - "step": 735, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0014700889587402 - }, - { - "episode": 11792, - "epoch": 0.07065223903847767, - "loss/policy_avg": -0.01751716434955597, - "lr": 9.529652351738243e-06, - "objective/entropy": -244.469970703125, - "objective/kl": 21.34896469116211, - "objective/non_score_reward": -1.0674481391906738, - "objective/rlhf_reward": -1.346073900104734, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 1.2310829162597656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.73828125, - "step": 736, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0015974044799805 - }, - { - "episode": 11808, - "epoch": 0.07074810367760721, - "loss/policy_avg": -0.13727766275405884, - "lr": 9.52901329243354e-06, - "objective/entropy": -152.7752227783203, - "objective/kl": 30.841548919677734, - "objective/non_score_reward": -1.5420774221420288, - "objective/rlhf_reward": -1.7683096885681149, - "objective/scores": 1.1, - "policy/approxkl_avg": 2.1432337760925293, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.732421875, - "step": 737, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000338315963745 - }, - { - "episode": 11824, - "epoch": 0.07084396831673677, - "loss/policy_avg": 0.24724145233631134, - "lr": 9.528374233128835e-06, - "objective/entropy": -249.35003662109375, - "objective/kl": 41.97819519042969, - "objective/non_score_reward": -2.098909854888916, - "objective/rlhf_reward": -6.945041160197601, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 14.357757568359375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7421875, - "step": 738, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984996318817139 - }, - { - "episode": 11840, - "epoch": 0.07093983295586631, - "loss/policy_avg": -0.1166142150759697, - "lr": 9.527735173824132e-06, - "objective/entropy": 16.65149688720703, - "objective/kl": 28.71587371826172, - "objective/non_score_reward": -1.4357936382293701, - "objective/rlhf_reward": -4.401539257078796, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 3.7607579231262207, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.64453125, - "step": 739, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990723133087158 - }, - { - "episode": 11856, - "epoch": 0.07103569759499587, - "loss/policy_avg": 0.035362888127565384, - "lr": 9.527096114519428e-06, - "objective/entropy": -227.2210235595703, - "objective/kl": 27.349641799926758, - "objective/non_score_reward": -1.36748206615448, - "objective/rlhf_reward": -3.865808401171284, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 9.06348705291748, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6640625, - "step": 740, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9996118545532227 - }, - { - "episode": 11872, - "epoch": 0.07113156223412541, - "loss/policy_avg": 0.31989267468452454, - "lr": 9.526457055214725e-06, - "objective/entropy": -213.7845458984375, - "objective/kl": 34.27381896972656, - "objective/non_score_reward": -1.713691234588623, - "objective/rlhf_reward": -4.732058527246986, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 26.892040252685547, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.67578125, - "step": 741, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0020368099212646 - }, - { - "episode": 11888, - "epoch": 0.07122742687325496, - "loss/policy_avg": 0.18080441653728485, - "lr": 9.525817995910022e-06, - "objective/entropy": -164.34909057617188, - "objective/kl": 29.15081024169922, - "objective/non_score_reward": -1.457540512084961, - "objective/rlhf_reward": -4.379564206214294, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 19.9893798828125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.64453125, - "step": 742, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9979908466339111 - }, - { - "episode": 11904, - "epoch": 0.0713232915123845, - "loss/policy_avg": 0.06947439908981323, - "lr": 9.525178936605317e-06, - "objective/entropy": -35.78013610839844, - "objective/kl": 30.88395118713379, - "objective/non_score_reward": -1.5441975593566895, - "objective/rlhf_reward": -4.620531051364496, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 9.903773307800293, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.658203125, - "step": 743, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0000925064086914 - }, - { - "episode": 11920, - "epoch": 0.07141915615151406, - "loss/policy_avg": 0.4868197441101074, - "lr": 9.524539877300614e-06, - "objective/entropy": -185.67857360839844, - "objective/kl": 30.794139862060547, - "objective/non_score_reward": -1.5397069454193115, - "objective/rlhf_reward": -4.833315048247499, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 23.752399444580078, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5703125, - "step": 744, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9970831871032715 - }, - { - "episode": 11936, - "epoch": 0.0715150207906436, - "loss/policy_avg": 0.4937871992588043, - "lr": 9.52390081799591e-06, - "objective/entropy": -196.15248107910156, - "objective/kl": 32.130393981933594, - "objective/non_score_reward": -1.6065199375152588, - "objective/rlhf_reward": -5.084443858175903, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.993836402893066, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.509765625, - "step": 745, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994457960128784 - }, - { - "episode": 11952, - "epoch": 0.07161088542977316, - "loss/policy_avg": 0.10673123598098755, - "lr": 9.523261758691206e-06, - "objective/entropy": -74.68463134765625, - "objective/kl": 34.281944274902344, - "objective/non_score_reward": -1.7140971422195435, - "objective/rlhf_reward": -3.9326697334062786, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 9.657389640808105, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4892578125, - "step": 746, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998197078704834 - }, - { - "episode": 11968, - "epoch": 0.0717067500689027, - "loss/policy_avg": 0.08303539454936981, - "lr": 9.522622699386503e-06, - "objective/entropy": -234.022705078125, - "objective/kl": 26.956684112548828, - "objective/non_score_reward": -1.3478342294692993, - "objective/rlhf_reward": -3.26863074518827, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 9.614282608032227, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.77734375, - "step": 747, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9977872371673584 - }, - { - "episode": 11984, - "epoch": 0.07180261470803226, - "loss/policy_avg": 0.006275704130530357, - "lr": 9.5219836400818e-06, - "objective/entropy": -179.78111267089844, - "objective/kl": 24.191059112548828, - "objective/non_score_reward": -1.2095528841018677, - "objective/rlhf_reward": -3.4789615509256553, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 3.5060572624206543, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.48828125, - "step": 748, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001844882965088 - }, - { - "episode": 12000, - "epoch": 0.07189847934716181, - "loss/policy_avg": 0.05262988060712814, - "lr": 9.521344580777097e-06, - "objective/entropy": -61.52648162841797, - "objective/kl": 24.345882415771484, - "objective/non_score_reward": -1.2172942161560059, - "objective/rlhf_reward": -2.7464705727258067, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 26.343456268310547, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.734375, - "step": 749, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9996674060821533 - }, - { - "episode": 12016, - "epoch": 0.07199434398629136, - "loss/policy_avg": 0.1489763706922531, - "lr": 9.520705521472394e-06, - "objective/entropy": -179.14523315429688, - "objective/kl": 25.692440032958984, - "objective/non_score_reward": -1.284622073173523, - "objective/rlhf_reward": -3.19107700415128, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 1.4589556455612183, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.517578125, - "step": 750, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9989516735076904 - }, - { - "episode": 12032, - "epoch": 0.07209020862542091, - "loss/policy_avg": 0.06708867847919464, - "lr": 9.520066462167689e-06, - "objective/entropy": -56.47541427612305, - "objective/kl": 42.95630645751953, - "objective/non_score_reward": -2.147815227508545, - "objective/rlhf_reward": -6.7664322808113795, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 6.856327056884766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.650390625, - "step": 751, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9969167709350586 - }, - { - "episode": 12048, - "epoch": 0.07218607326455045, - "loss/policy_avg": 0.3973958194255829, - "lr": 9.519427402862986e-06, - "objective/entropy": -244.11431884765625, - "objective/kl": 25.62933921813965, - "objective/non_score_reward": -1.2814669609069824, - "objective/rlhf_reward": -3.301039035591196, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 48.01885223388672, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.671875, - "step": 752, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9983410835266113 - }, - { - "episode": 12064, - "epoch": 0.07228193790368001, - "loss/policy_avg": 0.016892850399017334, - "lr": 9.518788343558283e-06, - "objective/entropy": -233.80613708496094, - "objective/kl": 33.0050048828125, - "objective/non_score_reward": -1.6502504348754883, - "objective/rlhf_reward": -4.653590510563786, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 15.416328430175781, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.560546875, - "step": 753, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.999025821685791 - }, - { - "episode": 12080, - "epoch": 0.07237780254280955, - "loss/policy_avg": 0.10087546706199646, - "lr": 9.51814928425358e-06, - "objective/entropy": -283.5254211425781, - "objective/kl": 25.051952362060547, - "objective/non_score_reward": -1.2525975704193115, - "objective/rlhf_reward": -2.6103905797004696, - "objective/scores": 0.6, - "policy/approxkl_avg": 19.29462432861328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 754, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9981729984283447 - }, - { - "episode": 12096, - "epoch": 0.07247366718193911, - "loss/policy_avg": 0.24108710885047913, - "lr": 9.517510224948877e-06, - "objective/entropy": -211.13575744628906, - "objective/kl": 35.66078186035156, - "objective/non_score_reward": -1.7830390930175781, - "objective/rlhf_reward": -5.708324392040339, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 11.15980339050293, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.60546875, - "step": 755, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9982430934906006 - }, - { - "episode": 12112, - "epoch": 0.07256953182106865, - "loss/policy_avg": 0.6718421578407288, - "lr": 9.516871165644172e-06, - "objective/entropy": -148.00872802734375, - "objective/kl": 30.348403930664062, - "objective/non_score_reward": -1.5174202919006348, - "objective/rlhf_reward": -4.669681048393249, - "objective/scores": 0.35, - "policy/approxkl_avg": 24.264657974243164, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.626953125, - "step": 756, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989105463027954 - }, - { - "episode": 12128, - "epoch": 0.0726653964601982, - "loss/policy_avg": 0.17684796452522278, - "lr": 9.516232106339469e-06, - "objective/entropy": -220.75283813476562, - "objective/kl": 18.81310272216797, - "objective/non_score_reward": -0.9406551122665405, - "objective/rlhf_reward": -2.3840183998025477, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 3.753880500793457, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.498046875, - "step": 757, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9982280731201172 - }, - { - "episode": 12144, - "epoch": 0.07276126109932775, - "loss/policy_avg": 0.5594636797904968, - "lr": 9.515593047034765e-06, - "objective/entropy": -182.7705535888672, - "objective/kl": 19.829849243164062, - "objective/non_score_reward": -0.991492509841919, - "objective/rlhf_reward": 0.4340301394462589, - "objective/scores": 1.1, - "policy/approxkl_avg": 28.46674346923828, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.572265625, - "step": 758, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991915225982666 - }, - { - "episode": 12160, - "epoch": 0.0728571257384573, - "loss/policy_avg": 0.6502060890197754, - "lr": 9.514953987730062e-06, - "objective/entropy": -112.33629608154297, - "objective/kl": 39.52580642700195, - "objective/non_score_reward": -1.9762903451919556, - "objective/rlhf_reward": -5.78245514847425, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 4.3783769607543945, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 759, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9975318908691406 - }, - { - "episode": 12176, - "epoch": 0.07295299037758685, - "loss/policy_avg": 0.88495934009552, - "lr": 9.51431492842536e-06, - "objective/entropy": -201.14666748046875, - "objective/kl": 27.90923309326172, - "objective/non_score_reward": -1.3954615592956543, - "objective/rlhf_reward": -4.240210583716064, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 3.1258697509765625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 760, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0020923614501953 - }, - { - "episode": 12192, - "epoch": 0.0730488550167164, - "loss/policy_avg": 0.3271714448928833, - "lr": 9.513675869120656e-06, - "objective/entropy": -236.55361938476562, - "objective/kl": 28.77971839904785, - "objective/non_score_reward": -1.43898606300354, - "objective/rlhf_reward": -4.35594413280487, - "objective/scores": 0.35, - "policy/approxkl_avg": 5.469420909881592, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6171875, - "step": 761, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997791051864624 - }, - { - "episode": 12208, - "epoch": 0.07314471965584594, - "loss/policy_avg": 0.0032866448163986206, - "lr": 9.513036809815951e-06, - "objective/entropy": -200.22227478027344, - "objective/kl": 28.73204803466797, - "objective/non_score_reward": -1.4366023540496826, - "objective/rlhf_reward": -4.142289552752095, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 0.5752939581871033, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.587890625, - "step": 762, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0023627281188965 - }, - { - "episode": 12224, - "epoch": 0.0732405842949755, - "loss/policy_avg": 0.21868771314620972, - "lr": 9.512397750511248e-06, - "objective/entropy": -187.9447784423828, - "objective/kl": 20.44854736328125, - "objective/non_score_reward": -1.0224274396896362, - "objective/rlhf_reward": -2.5739379761540255, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 11.789055824279785, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 763, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9979248046875 - }, - { - "episode": 12240, - "epoch": 0.07333644893410504, - "loss/policy_avg": 0.3879333734512329, - "lr": 9.511758691206545e-06, - "objective/entropy": -267.96685791015625, - "objective/kl": 28.91057586669922, - "objective/non_score_reward": -1.4455287456512451, - "objective/rlhf_reward": -3.8347037536668136, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.813044548034668, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6953125, - "step": 764, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0009827613830566 - }, - { - "episode": 12256, - "epoch": 0.0734323135732346, - "loss/policy_avg": 0.06569409370422363, - "lr": 9.511119631901842e-06, - "objective/entropy": -207.83352661132812, - "objective/kl": 24.208805084228516, - "objective/non_score_reward": -1.2104402780532837, - "objective/rlhf_reward": -3.2855019261508733, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 2.039762258529663, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.658203125, - "step": 765, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0008108615875244 - }, - { - "episode": 12272, - "epoch": 0.07352817821236414, - "loss/policy_avg": 0.9109029769897461, - "lr": 9.510480572597139e-06, - "objective/entropy": -85.82101440429688, - "objective/kl": 31.18517303466797, - "objective/non_score_reward": -1.5592585802078247, - "objective/rlhf_reward": -4.50370092789332, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 4.987689018249512, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.56640625, - "step": 766, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997962236404419 - }, - { - "episode": 12288, - "epoch": 0.0736240428514937, - "loss/policy_avg": 0.44006603956222534, - "lr": 9.509841513292434e-06, - "objective/entropy": -254.5596923828125, - "objective/kl": 26.123559951782227, - "objective/non_score_reward": -1.3061779737472534, - "objective/rlhf_reward": -3.6684524109035284, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 13.005337715148926, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.552734375, - "step": 767, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9978489875793457 - }, - { - "episode": 12304, - "epoch": 0.07371990749062324, - "loss/policy_avg": 0.14191022515296936, - "lr": 9.509202453987731e-06, - "objective/entropy": -185.1569061279297, - "objective/kl": 38.093666076660156, - "objective/non_score_reward": -1.9046835899353027, - "objective/rlhf_reward": -7.618734002113342, - "objective/scores": 0.0, - "policy/approxkl_avg": 60.80290603637695, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.640625, - "step": 768, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9955925941467285 - }, - { - "episode": 12320, - "epoch": 0.0738157721297528, - "loss/policy_avg": -0.31537145376205444, - "lr": 9.508563394683026e-06, - "objective/entropy": -164.9215087890625, - "objective/kl": 30.594449996948242, - "objective/non_score_reward": -1.5297224521636963, - "objective/rlhf_reward": -4.63793725055015, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.2754226922988892, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6171875, - "step": 769, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001909017562866 - }, - { - "episode": 12336, - "epoch": 0.07391163676888234, - "loss/policy_avg": 0.034731436520814896, - "lr": 9.507924335378323e-06, - "objective/entropy": -200.43959045410156, - "objective/kl": 36.4830436706543, - "objective/non_score_reward": -1.8241522312164307, - "objective/rlhf_reward": -5.8727765872078805, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 3.3153905868530273, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.626953125, - "step": 770, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0031118392944336 - }, - { - "episode": 12352, - "epoch": 0.07400750140801189, - "loss/policy_avg": 0.29965466260910034, - "lr": 9.50728527607362e-06, - "objective/entropy": -168.58261108398438, - "objective/kl": 34.881736755371094, - "objective/non_score_reward": -1.7440869808197021, - "objective/rlhf_reward": -5.314488296926605, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 12.419918060302734, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.744140625, - "step": 771, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999511957168579 - }, - { - "episode": 12368, - "epoch": 0.07410336604714143, - "loss/policy_avg": 0.5840628743171692, - "lr": 9.506646216768917e-06, - "objective/entropy": -149.50210571289062, - "objective/kl": 26.40768051147461, - "objective/non_score_reward": -1.3203840255737305, - "objective/rlhf_reward": -3.8005837230042214, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 10.453241348266602, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.560546875, - "step": 772, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990813732147217 - }, - { - "episode": 12384, - "epoch": 0.07419923068627099, - "loss/policy_avg": -0.20146791636943817, - "lr": 9.506007157464214e-06, - "objective/entropy": -206.66688537597656, - "objective/kl": 25.146541595458984, - "objective/non_score_reward": -1.2573271989822388, - "objective/rlhf_reward": -5.029308795928955, - "objective/scores": 0.0, - "policy/approxkl_avg": 55.61228561401367, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.728515625, - "step": 773, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9993964433670044 - }, - { - "episode": 12400, - "epoch": 0.07429509532540053, - "loss/policy_avg": 2.0998456478118896, - "lr": 9.50536809815951e-06, - "objective/entropy": -135.09249877929688, - "objective/kl": 26.86371612548828, - "objective/non_score_reward": -1.3431859016418457, - "objective/rlhf_reward": -3.922145526023254, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 7.190234184265137, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.548828125, - "step": 774, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000576972961426 - }, - { - "episode": 12416, - "epoch": 0.07439095996453009, - "loss/policy_avg": 0.024284163489937782, - "lr": 9.504729038854806e-06, - "objective/entropy": -269.6484375, - "objective/kl": 21.226428985595703, - "objective/non_score_reward": -1.061321496963501, - "objective/rlhf_reward": -2.7946879669145197, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 6.07242488861084, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.66796875, - "step": 775, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999997615814209 - }, - { - "episode": 12432, - "epoch": 0.07448682460365963, - "loss/policy_avg": 0.03317616134881973, - "lr": 9.504089979550103e-06, - "objective/entropy": -234.43389892578125, - "objective/kl": 27.79866600036621, - "objective/non_score_reward": -1.3899333477020264, - "objective/rlhf_reward": -3.8264001766840616, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.3638486862182617, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.689453125, - "step": 776, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9996023178100586 - }, - { - "episode": 12448, - "epoch": 0.07458268924278919, - "loss/policy_avg": 0.16213266551494598, - "lr": 9.5034509202454e-06, - "objective/entropy": -203.708740234375, - "objective/kl": 38.612911224365234, - "objective/non_score_reward": -1.9306457042694092, - "objective/rlhf_reward": -6.271984438510284, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 1.698218584060669, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.64453125, - "step": 777, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9995825290679932 - }, - { - "episode": 12464, - "epoch": 0.07467855388191873, - "loss/policy_avg": 0.2597602605819702, - "lr": 9.502811860940696e-06, - "objective/entropy": -250.4356231689453, - "objective/kl": 30.581310272216797, - "objective/non_score_reward": -1.529065489768982, - "objective/rlhf_reward": -4.737660029021603, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 8.781853675842285, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.615234375, - "step": 778, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9989944696426392 - }, - { - "episode": 12480, - "epoch": 0.07477441852104828, - "loss/policy_avg": -0.24061758816242218, - "lr": 9.502172801635993e-06, - "objective/entropy": -98.61205291748047, - "objective/kl": 26.375612258911133, - "objective/non_score_reward": -1.3187806606292725, - "objective/rlhf_reward": -3.794169786389231, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 5.955351829528809, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.7265625, - "step": 779, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.003558397293091 - }, - { - "episode": 12496, - "epoch": 0.07487028316017783, - "loss/policy_avg": 0.48288995027542114, - "lr": 9.50153374233129e-06, - "objective/entropy": -230.7918701171875, - "objective/kl": 37.52941131591797, - "objective/non_score_reward": -1.8764704465866089, - "objective/rlhf_reward": -6.024929526265025, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 12.408464431762695, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.68359375, - "step": 780, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990174770355225 - }, - { - "episode": 12512, - "epoch": 0.07496614779930738, - "loss/policy_avg": 0.27871203422546387, - "lr": 9.500894683026585e-06, - "objective/entropy": -159.85903930664062, - "objective/kl": 25.038909912109375, - "objective/non_score_reward": -1.2519454956054688, - "objective/rlhf_reward": -2.607781863212585, - "objective/scores": 0.6, - "policy/approxkl_avg": 46.26438903808594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 781, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000070095062256 - }, - { - "episode": 12528, - "epoch": 0.07506201243843692, - "loss/policy_avg": 0.06291055679321289, - "lr": 9.500255623721882e-06, - "objective/entropy": -163.0406494140625, - "objective/kl": 27.101749420166016, - "objective/non_score_reward": -1.3550875186920166, - "objective/rlhf_reward": -4.061099970076961, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 13.61475658416748, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.63671875, - "step": 782, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986028671264648 - }, - { - "episode": 12544, - "epoch": 0.07515787707756648, - "loss/policy_avg": 0.07766500115394592, - "lr": 9.499616564417179e-06, - "objective/entropy": -264.68377685546875, - "objective/kl": 26.38882827758789, - "objective/non_score_reward": -1.319441556930542, - "objective/rlhf_reward": -2.3540468558084697, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 29.816272735595703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 783, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9993165731430054 - }, - { - "episode": 12560, - "epoch": 0.07525374171669602, - "loss/policy_avg": -0.25779616832733154, - "lr": 9.498977505112476e-06, - "objective/entropy": -192.4373016357422, - "objective/kl": 30.569807052612305, - "objective/non_score_reward": -1.528490424156189, - "objective/rlhf_reward": -4.5098417139688305, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 3.409776210784912, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.765625, - "step": 784, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0031237602233887 - }, - { - "episode": 12576, - "epoch": 0.07534960635582558, - "loss/policy_avg": -0.23182180523872375, - "lr": 9.498338445807773e-06, - "objective/entropy": -116.57367706298828, - "objective/kl": 30.319534301757812, - "objective/non_score_reward": -1.5159766674041748, - "objective/rlhf_reward": -4.704657160972042, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 3.2308108806610107, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.3955078125, - "step": 785, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001164436340332 - }, - { - "episode": 12592, - "epoch": 0.07544547099495512, - "loss/policy_avg": 0.270114541053772, - "lr": 9.497699386503068e-06, - "objective/entropy": -213.6279296875, - "objective/kl": 34.02395248413086, - "objective/non_score_reward": -1.701197624206543, - "objective/rlhf_reward": -3.8810713633310527, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 4.159467697143555, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.556640625, - "step": 786, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999849557876587 - }, - { - "episode": 12608, - "epoch": 0.07554133563408467, - "loss/policy_avg": 0.01593317836523056, - "lr": 9.497060327198365e-06, - "objective/entropy": -83.6307601928711, - "objective/kl": 28.397233963012695, - "objective/non_score_reward": -1.4198617935180664, - "objective/rlhf_reward": -4.198494317944407, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 13.974614143371582, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.576171875, - "step": 787, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9974963665008545 - }, - { - "episode": 12624, - "epoch": 0.07563720027321422, - "loss/policy_avg": 0.122782863676548, - "lr": 9.496421267893662e-06, - "objective/entropy": -66.27203369140625, - "objective/kl": 20.0443115234375, - "objective/non_score_reward": -1.0022156238555908, - "objective/rlhf_reward": -2.6302602673448146, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 5.128955364227295, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.78125, - "step": 788, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002767562866211 - }, - { - "episode": 12640, - "epoch": 0.07573306491234377, - "loss/policy_avg": 0.06789802759885788, - "lr": 9.495782208588959e-06, - "objective/entropy": -174.1296844482422, - "objective/kl": 28.25243377685547, - "objective/non_score_reward": -1.4126217365264893, - "objective/rlhf_reward": -3.988627438963042, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 21.132152557373047, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.802734375, - "step": 789, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0010154247283936 - }, - { - "episode": 12656, - "epoch": 0.07582892955147331, - "loss/policy_avg": 0.1666814684867859, - "lr": 9.495143149284254e-06, - "objective/entropy": -226.70257568359375, - "objective/kl": 28.976097106933594, - "objective/non_score_reward": -1.4488048553466797, - "objective/rlhf_reward": -4.371387500961391, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 2.0613138675689697, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62890625, - "step": 790, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994125366210938 - }, - { - "episode": 12672, - "epoch": 0.07592479419060287, - "loss/policy_avg": 0.1284073442220688, - "lr": 9.49450408997955e-06, - "objective/entropy": -215.84002685546875, - "objective/kl": 28.486852645874023, - "objective/non_score_reward": -1.4243427515029907, - "objective/rlhf_reward": -4.355735114126831, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 5.659012317657471, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.64453125, - "step": 791, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0002176761627197 - }, - { - "episode": 12688, - "epoch": 0.07602065882973241, - "loss/policy_avg": -0.04723303020000458, - "lr": 9.493865030674848e-06, - "objective/entropy": -227.61280822753906, - "objective/kl": 28.772476196289062, - "objective/non_score_reward": -1.4386236667633057, - "objective/rlhf_reward": -2.830775891185972, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 1.8349313735961914, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.609375, - "step": 792, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002286911010742 - }, - { - "episode": 12704, - "epoch": 0.07611652346886197, - "loss/policy_avg": -0.01974731869995594, - "lr": 9.493225971370144e-06, - "objective/entropy": -168.45291137695312, - "objective/kl": 32.674957275390625, - "objective/non_score_reward": -1.633747935295105, - "objective/rlhf_reward": -5.209478828936739, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 0.8098639249801636, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 793, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0004916191101074 - }, - { - "episode": 12720, - "epoch": 0.07621238810799151, - "loss/policy_avg": 0.3524478077888489, - "lr": 9.49258691206544e-06, - "objective/entropy": -170.04669189453125, - "objective/kl": 35.1775016784668, - "objective/non_score_reward": -1.7588751316070557, - "objective/rlhf_reward": -5.479241101947382, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 4.70783805847168, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.658203125, - "step": 794, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9981024265289307 - }, - { - "episode": 12736, - "epoch": 0.07630825274712107, - "loss/policy_avg": 0.14937232434749603, - "lr": 9.491947852760736e-06, - "objective/entropy": -258.00518798828125, - "objective/kl": 30.382396697998047, - "objective/non_score_reward": -1.5191197395324707, - "objective/rlhf_reward": -4.472358975473957, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 8.522323608398438, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.68359375, - "step": 795, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.00205659866333 - }, - { - "episode": 12752, - "epoch": 0.07640411738625061, - "loss/policy_avg": 0.4101511240005493, - "lr": 9.491308793456033e-06, - "objective/entropy": -97.3719482421875, - "objective/kl": 49.89447021484375, - "objective/non_score_reward": -2.4947237968444824, - "objective/rlhf_reward": -7.578894591331482, - "objective/scores": 0.6, - "policy/approxkl_avg": 19.377134323120117, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.498046875, - "step": 796, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981412887573242 - }, - { - "episode": 12768, - "epoch": 0.07649998202538016, - "loss/policy_avg": -0.0627971962094307, - "lr": 9.49066973415133e-06, - "objective/entropy": -110.8655776977539, - "objective/kl": 44.73468017578125, - "objective/non_score_reward": -2.23673415184021, - "objective/rlhf_reward": -6.546936726570129, - "objective/scores": 0.6, - "policy/approxkl_avg": 5.804272651672363, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.806640625, - "step": 797, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9971017837524414 - }, - { - "episode": 12784, - "epoch": 0.0765958466645097, - "loss/policy_avg": 0.3731452226638794, - "lr": 9.490030674846627e-06, - "objective/entropy": -15.07757568359375, - "objective/kl": 24.15683364868164, - "objective/non_score_reward": -1.2078416347503662, - "objective/rlhf_reward": -2.4313664793968197, - "objective/scores": 0.6, - "policy/approxkl_avg": 5.745340347290039, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.638671875, - "step": 798, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993362426757812 - }, - { - "episode": 12800, - "epoch": 0.07669171130363926, - "loss/policy_avg": 0.3336324691772461, - "lr": 9.489391615541922e-06, - "objective/entropy": -249.59414672851562, - "objective/kl": 28.68617820739746, - "objective/non_score_reward": -1.4343090057373047, - "objective/rlhf_reward": -2.8135166510355205, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 3.9479708671569824, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.693359375, - "step": 799, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993374347686768 - }, - { - "episode": 12816, - "epoch": 0.0767875759427688, - "loss/policy_avg": 0.12261458486318588, - "lr": 9.488752556237219e-06, - "objective/entropy": -207.68580627441406, - "objective/kl": 33.91386413574219, - "objective/non_score_reward": -1.6956932544708252, - "objective/rlhf_reward": -5.301820400174021, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 25.18114471435547, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 800, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9971894025802612 - }, - { - "episode": 12832, - "epoch": 0.07688344058189836, - "loss/policy_avg": 0.1192292720079422, - "lr": 9.488113496932516e-06, - "objective/entropy": -268.4300842285156, - "objective/kl": 26.710205078125, - "objective/non_score_reward": -1.3355103731155396, - "objective/rlhf_reward": -4.000405719786316, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.064979553222656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.638671875, - "step": 801, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9994006156921387 - }, - { - "episode": 12848, - "epoch": 0.0769793052210279, - "loss/policy_avg": 0.4274081587791443, - "lr": 9.487474437627813e-06, - "objective/entropy": -125.00625610351562, - "objective/kl": 36.30561065673828, - "objective/non_score_reward": -1.815280795097351, - "objective/rlhf_reward": -5.901873194907589, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 9.215574264526367, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.55078125, - "step": 802, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999779224395752 - }, - { - "episode": 12864, - "epoch": 0.07707516986015746, - "loss/policy_avg": 0.02082793414592743, - "lr": 9.48683537832311e-06, - "objective/entropy": 49.048545837402344, - "objective/kl": 31.830245971679688, - "objective/non_score_reward": -1.5915122032165527, - "objective/rlhf_reward": -4.915450672717437, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 2.6811680793762207, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4384765625, - "step": 803, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994103908538818 - }, - { - "episode": 12880, - "epoch": 0.077171034499287, - "loss/policy_avg": 0.1582624763250351, - "lr": 9.486196319018407e-06, - "objective/entropy": -110.25260925292969, - "objective/kl": 31.00435447692871, - "objective/non_score_reward": -1.550217866897583, - "objective/rlhf_reward": -3.8008712291717526, - "objective/scores": 0.6, - "policy/approxkl_avg": 3.5253429412841797, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.55859375, - "step": 804, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0027151107788086 - }, - { - "episode": 12896, - "epoch": 0.07726689913841656, - "loss/policy_avg": 0.09249435365200043, - "lr": 9.485557259713702e-06, - "objective/entropy": -203.63662719726562, - "objective/kl": 31.04816436767578, - "objective/non_score_reward": -1.552408218383789, - "objective/rlhf_reward": -4.547773247182952, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.3485993146896362, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 805, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999431848526001 - }, - { - "episode": 12912, - "epoch": 0.0773627637775461, - "loss/policy_avg": 0.44563794136047363, - "lr": 9.484918200408999e-06, - "objective/entropy": -163.74508666992188, - "objective/kl": 31.982746124267578, - "objective/non_score_reward": -1.599137306213379, - "objective/rlhf_reward": -3.472829972149107, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 87.72571563720703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.70703125, - "step": 806, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001553535461426 - }, - { - "episode": 12928, - "epoch": 0.07745862841667565, - "loss/policy_avg": -0.017649848014116287, - "lr": 9.484279141104296e-06, - "objective/entropy": -266.5451965332031, - "objective/kl": 27.058134078979492, - "objective/non_score_reward": -1.3529068231582642, - "objective/rlhf_reward": -1.0116270542144772, - "objective/scores": 1.1, - "policy/approxkl_avg": 5.037982940673828, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.654296875, - "step": 807, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0039749145507812 - }, - { - "episode": 12944, - "epoch": 0.07755449305580521, - "loss/policy_avg": 5.042888641357422, - "lr": 9.483640081799592e-06, - "objective/entropy": -212.65740966796875, - "objective/kl": 24.790084838867188, - "objective/non_score_reward": -1.2395042181015015, - "objective/rlhf_reward": -3.4770642546967263, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 11.046760559082031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.533203125, - "step": 808, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0002431869506836 - }, - { - "episode": 12960, - "epoch": 0.07765035769493475, - "loss/policy_avg": -0.07623002678155899, - "lr": 9.48300102249489e-06, - "objective/entropy": -167.7131805419922, - "objective/kl": 31.204689025878906, - "objective/non_score_reward": -1.5602343082427979, - "objective/rlhf_reward": -4.790339152427062, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 5.110037803649902, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5234375, - "step": 809, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9989296197891235 - }, - { - "episode": 12976, - "epoch": 0.07774622233406431, - "loss/policy_avg": 0.0697702169418335, - "lr": 9.482361963190185e-06, - "objective/entropy": -99.56057739257812, - "objective/kl": 40.95980453491211, - "objective/non_score_reward": -2.047990322113037, - "objective/rlhf_reward": -5.268241856933805, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 1.0177828073501587, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.595703125, - "step": 810, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9999544620513916 - }, - { - "episode": 12992, - "epoch": 0.07784208697319385, - "loss/policy_avg": 0.011765815317630768, - "lr": 9.481722903885481e-06, - "objective/entropy": -270.2078857421875, - "objective/kl": 32.53266906738281, - "objective/non_score_reward": -1.6266334056854248, - "objective/rlhf_reward": -4.950274675098017, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 10.882495880126953, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.685546875, - "step": 811, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.997708797454834 - }, - { - "episode": 13008, - "epoch": 0.0779379516123234, - "loss/policy_avg": 0.4012794494628906, - "lr": 9.481083844580777e-06, - "objective/entropy": -139.22914123535156, - "objective/kl": 37.05573272705078, - "objective/non_score_reward": -1.8527867794036865, - "objective/rlhf_reward": -5.586318249973367, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 210.83877563476562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6640625, - "step": 812, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001636505126953 - }, - { - "episode": 13024, - "epoch": 0.07803381625145295, - "loss/policy_avg": 0.2699980139732361, - "lr": 9.480444785276073e-06, - "objective/entropy": -196.59963989257812, - "objective/kl": 30.699893951416016, - "objective/non_score_reward": -1.5349947214126587, - "objective/rlhf_reward": -3.7399788856506344, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.332146167755127, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.619140625, - "step": 813, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9989255666732788 - }, - { - "episode": 13040, - "epoch": 0.0781296808905825, - "loss/policy_avg": 0.20207370817661285, - "lr": 9.47980572597137e-06, - "objective/entropy": -267.2593994140625, - "objective/kl": 33.34029006958008, - "objective/non_score_reward": -1.6670145988464355, - "objective/rlhf_reward": -5.342545185118837, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.632169246673584, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.564453125, - "step": 814, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9993088245391846 - }, - { - "episode": 13056, - "epoch": 0.07822554552971205, - "loss/policy_avg": 0.1745888739824295, - "lr": 9.479166666666667e-06, - "objective/entropy": -108.20680236816406, - "objective/kl": 35.203025817871094, - "objective/non_score_reward": -1.7601512670516968, - "objective/rlhf_reward": -5.484345762935236, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 8.32550048828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.736328125, - "step": 815, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0030176639556885 - }, - { - "episode": 13072, - "epoch": 0.0783214101688416, - "loss/policy_avg": 0.2600640654563904, - "lr": 9.478527607361964e-06, - "objective/entropy": -204.03048706054688, - "objective/kl": 40.41114807128906, - "objective/non_score_reward": -2.020557403564453, - "objective/rlhf_reward": -6.74059360316339, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.140628814697266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619140625, - "step": 816, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0000810623168945 - }, - { - "episode": 13088, - "epoch": 0.07841727480797114, - "loss/policy_avg": 0.5273202061653137, - "lr": 9.477888548057261e-06, - "objective/entropy": -241.156494140625, - "objective/kl": 24.541404724121094, - "objective/non_score_reward": -1.2270702123641968, - "objective/rlhf_reward": -3.246421401918517, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 14.965031623840332, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.61328125, - "step": 817, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990627765655518 - }, - { - "episode": 13104, - "epoch": 0.0785131394471007, - "loss/policy_avg": -0.09151424467563629, - "lr": 9.477249488752556e-06, - "objective/entropy": -219.21754455566406, - "objective/kl": 31.261905670166016, - "objective/non_score_reward": -1.5630953311920166, - "objective/rlhf_reward": -4.4275525763359775, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 2.8227334022521973, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.833984375, - "step": 818, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0036370754241943 - }, - { - "episode": 13120, - "epoch": 0.07860900408623024, - "loss/policy_avg": 0.13953115046024323, - "lr": 9.476610429447853e-06, - "objective/entropy": -186.8937530517578, - "objective/kl": 27.69632339477539, - "objective/non_score_reward": -1.3848161697387695, - "objective/rlhf_reward": -3.1392647981643673, - "objective/scores": 0.6, - "policy/approxkl_avg": 3.2056455612182617, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 819, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.003466844558716 - }, - { - "episode": 13136, - "epoch": 0.0787048687253598, - "loss/policy_avg": 0.6420396566390991, - "lr": 9.47597137014315e-06, - "objective/entropy": -134.00025939941406, - "objective/kl": 22.993852615356445, - "objective/non_score_reward": -1.1496926546096802, - "objective/rlhf_reward": -2.651359389500554, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 3.607414722442627, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.611328125, - "step": 820, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000748634338379 - }, - { - "episode": 13152, - "epoch": 0.07880073336448934, - "loss/policy_avg": 0.08356916159391403, - "lr": 9.475332310838447e-06, - "objective/entropy": -189.72003173828125, - "objective/kl": 26.506973266601562, - "objective/non_score_reward": -1.3253486156463623, - "objective/rlhf_reward": -3.959759166746765, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.290050029754639, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 821, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9971721172332764 - }, - { - "episode": 13168, - "epoch": 0.0788965980036189, - "loss/policy_avg": 0.11917827278375626, - "lr": 9.474693251533744e-06, - "objective/entropy": -207.30722045898438, - "objective/kl": 35.41877746582031, - "objective/non_score_reward": -1.7709391117095947, - "objective/rlhf_reward": -5.683756327629089, - "objective/scores": 0.35, - "policy/approxkl_avg": 6.870448112487793, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 822, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9955227375030518 - }, - { - "episode": 13184, - "epoch": 0.07899246264274844, - "loss/policy_avg": -0.3528624475002289, - "lr": 9.474054192229039e-06, - "objective/entropy": -138.19627380371094, - "objective/kl": 27.491954803466797, - "objective/non_score_reward": -1.3745976686477661, - "objective/rlhf_reward": -4.156755199938446, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 1.025694727897644, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.52734375, - "step": 823, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0017786026000977 - }, - { - "episode": 13200, - "epoch": 0.079088327281878, - "loss/policy_avg": 0.3965766727924347, - "lr": 9.473415132924336e-06, - "objective/entropy": -244.5587921142578, - "objective/kl": 28.361434936523438, - "objective/non_score_reward": -1.4180717468261719, - "objective/rlhf_reward": -3.549580337778602, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 22.821792602539062, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.53515625, - "step": 824, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.997540831565857 - }, - { - "episode": 13216, - "epoch": 0.07918419192100754, - "loss/policy_avg": 0.183881938457489, - "lr": 9.472776073619633e-06, - "objective/entropy": -235.81063842773438, - "objective/kl": 35.635047912597656, - "objective/non_score_reward": -1.7817524671554565, - "objective/rlhf_reward": -5.785373976736694, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 13.993101119995117, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 825, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9987282752990723 - }, - { - "episode": 13232, - "epoch": 0.07928005656013709, - "loss/policy_avg": 0.13472305238246918, - "lr": 9.47213701431493e-06, - "objective/entropy": -209.61251831054688, - "objective/kl": 32.511722564697266, - "objective/non_score_reward": -1.6255862712860107, - "objective/rlhf_reward": -5.176832351714296, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 10.514575958251953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.693359375, - "step": 826, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9980220794677734 - }, - { - "episode": 13248, - "epoch": 0.07937592119926663, - "loss/policy_avg": 0.28974202275276184, - "lr": 9.471497955010226e-06, - "objective/entropy": -277.55413818359375, - "objective/kl": 23.343517303466797, - "objective/non_score_reward": -1.1671757698059082, - "objective/rlhf_reward": -4.668703377246857, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.868777275085449, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.595703125, - "step": 827, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984409809112549 - }, - { - "episode": 13264, - "epoch": 0.07947178583839619, - "loss/policy_avg": 0.0649593323469162, - "lr": 9.470858895705523e-06, - "objective/entropy": -168.09161376953125, - "objective/kl": 32.58544921875, - "objective/non_score_reward": -1.6292723417282104, - "objective/rlhf_reward": -5.001317584308323, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 13.682709693908691, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.630859375, - "step": 828, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994233846664429 - }, - { - "episode": 13280, - "epoch": 0.07956765047752573, - "loss/policy_avg": 0.01223127543926239, - "lr": 9.470219836400818e-06, - "objective/entropy": -24.52312469482422, - "objective/kl": 37.070613861083984, - "objective/non_score_reward": -1.8535306453704834, - "objective/rlhf_reward": -5.963524679751739, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 1.4948031902313232, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.591796875, - "step": 829, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0006215572357178 - }, - { - "episode": 13296, - "epoch": 0.07966351511665529, - "loss/policy_avg": 0.08012821525335312, - "lr": 9.469580777096115e-06, - "objective/entropy": -222.74710083007812, - "objective/kl": 29.31437873840332, - "objective/non_score_reward": -1.4657189846038818, - "objective/rlhf_reward": -5.862875819206238, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.948197364807129, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.720703125, - "step": 830, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999361515045166 - }, - { - "episode": 13312, - "epoch": 0.07975937975578483, - "loss/policy_avg": 0.25244101881980896, - "lr": 9.468941717791412e-06, - "objective/entropy": -256.2400817871094, - "objective/kl": 25.82564926147461, - "objective/non_score_reward": -1.2912824153900146, - "objective/rlhf_reward": -5.165129542350769, - "objective/scores": 0.0, - "policy/approxkl_avg": 25.767894744873047, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.625, - "step": 831, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9958332777023315 - }, - { - "episode": 13328, - "epoch": 0.07985524439491438, - "loss/policy_avg": 0.20151713490486145, - "lr": 9.468302658486709e-06, - "objective/entropy": -176.53012084960938, - "objective/kl": 31.989328384399414, - "objective/non_score_reward": -1.5994665622711182, - "objective/rlhf_reward": -4.94726787051712, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 6.573209762573242, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 832, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0001397132873535 - }, - { - "episode": 13344, - "epoch": 0.07995110903404393, - "loss/policy_avg": 0.40637868642807007, - "lr": 9.467663599182006e-06, - "objective/entropy": -157.83944702148438, - "objective/kl": 26.236248016357422, - "objective/non_score_reward": -1.311812400817871, - "objective/rlhf_reward": -2.847249662876129, - "objective/scores": 0.6, - "policy/approxkl_avg": 41.408966064453125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69921875, - "step": 833, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9979543685913086 - }, - { - "episode": 13360, - "epoch": 0.08004697367317348, - "loss/policy_avg": 0.4117756485939026, - "lr": 9.467024539877301e-06, - "objective/entropy": -154.52528381347656, - "objective/kl": 34.40885925292969, - "objective/non_score_reward": -1.7204430103302002, - "objective/rlhf_reward": -3.958053027034971, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 5.24909782409668, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.552734375, - "step": 834, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9996882677078247 - }, - { - "episode": 13376, - "epoch": 0.08014283831230302, - "loss/policy_avg": 0.25968849658966064, - "lr": 9.466385480572598e-06, - "objective/entropy": -35.48725509643555, - "objective/kl": 48.416969299316406, - "objective/non_score_reward": -2.4208483695983887, - "objective/rlhf_reward": -8.324143612121029, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 7.6608781814575195, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4736328125, - "step": 835, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974427223205566 - }, - { - "episode": 13392, - "epoch": 0.08023870295143258, - "loss/policy_avg": 0.6013174057006836, - "lr": 9.465746421267893e-06, - "objective/entropy": -131.218994140625, - "objective/kl": 40.460113525390625, - "objective/non_score_reward": -2.023005723953247, - "objective/rlhf_reward": -6.267194564613412, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 5.2574968338012695, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.765625, - "step": 836, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985507726669312 - }, - { - "episode": 13408, - "epoch": 0.08033456759056212, - "loss/policy_avg": 0.024118170142173767, - "lr": 9.46510736196319e-06, - "objective/entropy": -219.2191162109375, - "objective/kl": 37.4605712890625, - "objective/non_score_reward": -1.8730283975601196, - "objective/rlhf_reward": -5.544702480511601, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.715839385986328, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.603515625, - "step": 837, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003450393676758 - }, - { - "episode": 13424, - "epoch": 0.08043043222969168, - "loss/policy_avg": 0.3022631108760834, - "lr": 9.464468302658487e-06, - "objective/entropy": -122.02997589111328, - "objective/kl": 32.87577438354492, - "objective/non_score_reward": -1.6437886953353882, - "objective/rlhf_reward": -5.196552612868649, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 3.3451852798461914, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 838, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9992682933807373 - }, - { - "episode": 13440, - "epoch": 0.08052629686882122, - "loss/policy_avg": 0.09435372054576874, - "lr": 9.463829243353784e-06, - "objective/entropy": -228.3193817138672, - "objective/kl": 27.057086944580078, - "objective/non_score_reward": -1.3528543710708618, - "objective/rlhf_reward": -3.7495579771405323, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 64.43006896972656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.642578125, - "step": 839, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9953296184539795 - }, - { - "episode": 13456, - "epoch": 0.08062216150795078, - "loss/policy_avg": 1.2935261726379395, - "lr": 9.46319018404908e-06, - "objective/entropy": -160.080322265625, - "objective/kl": 34.4007568359375, - "objective/non_score_reward": -1.7200379371643066, - "objective/rlhf_reward": -5.538515916376738, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 131.64187622070312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6015625, - "step": 840, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983408451080322 - }, - { - "episode": 13472, - "epoch": 0.08071802614708032, - "loss/policy_avg": 0.5178288817405701, - "lr": 9.462551124744378e-06, - "objective/entropy": -140.98907470703125, - "objective/kl": 32.42417526245117, - "objective/non_score_reward": -1.621208906173706, - "objective/rlhf_reward": -4.084835386276245, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.9638893604278564, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 841, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993921518325806 - }, - { - "episode": 13488, - "epoch": 0.08081389078620987, - "loss/policy_avg": 1.674887776374817, - "lr": 9.461912065439673e-06, - "objective/entropy": -140.6572723388672, - "objective/kl": 33.64493179321289, - "objective/non_score_reward": -1.682246446609497, - "objective/rlhf_reward": -2.3289861440658566, - "objective/scores": 1.1, - "policy/approxkl_avg": 2.7393760681152344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.73828125, - "step": 842, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0029544830322266 - }, - { - "episode": 13504, - "epoch": 0.08090975542533942, - "loss/policy_avg": 0.10809826105833054, - "lr": 9.46127300613497e-06, - "objective/entropy": 33.49109649658203, - "objective/kl": 46.121177673339844, - "objective/non_score_reward": -2.3060591220855713, - "objective/rlhf_reward": -7.399407501491616, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 10.247078895568848, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7578125, - "step": 843, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9997997283935547 - }, - { - "episode": 13520, - "epoch": 0.08100562006446897, - "loss/policy_avg": 0.08235388994216919, - "lr": 9.460633946830267e-06, - "objective/entropy": -232.94918823242188, - "objective/kl": 29.242427825927734, - "objective/non_score_reward": -1.4621214866638184, - "objective/rlhf_reward": -5.848485827445984, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.9668121337890625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.708984375, - "step": 844, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998790979385376 - }, - { - "episode": 13536, - "epoch": 0.08110148470359851, - "loss/policy_avg": 0.15575401484966278, - "lr": 9.459994887525563e-06, - "objective/entropy": -230.47235107421875, - "objective/kl": 39.588829040527344, - "objective/non_score_reward": -1.9794416427612305, - "objective/rlhf_reward": -6.401994669231113, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 3.700314521789551, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.708984375, - "step": 845, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9979462623596191 - }, - { - "episode": 13552, - "epoch": 0.08119734934272807, - "loss/policy_avg": 0.13659973442554474, - "lr": 9.45935582822086e-06, - "objective/entropy": -174.33474731445312, - "objective/kl": 28.351903915405273, - "objective/non_score_reward": -1.4175951480865479, - "objective/rlhf_reward": -2.746661697269651, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 9.24754524230957, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.705078125, - "step": 846, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0036792755126953 - }, - { - "episode": 13568, - "epoch": 0.08129321398185761, - "loss/policy_avg": -0.0010715574026107788, - "lr": 9.458716768916156e-06, - "objective/entropy": -106.94636535644531, - "objective/kl": 43.695289611816406, - "objective/non_score_reward": -2.1847643852233887, - "objective/rlhf_reward": -7.077198391378509, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 4.114851474761963, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5859375, - "step": 847, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0022311210632324 - }, - { - "episode": 13584, - "epoch": 0.08138907862098717, - "loss/policy_avg": -0.020745811983942986, - "lr": 9.458077709611452e-06, - "objective/entropy": -274.30377197265625, - "objective/kl": 29.099441528320312, - "objective/non_score_reward": -1.4549720287322998, - "objective/rlhf_reward": -4.215768191877919, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.374234199523926, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.55078125, - "step": 848, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0002601146698 - }, - { - "episode": 13600, - "epoch": 0.08148494326011671, - "loss/policy_avg": 0.08369505405426025, - "lr": 9.45743865030675e-06, - "objective/entropy": -90.9344482421875, - "objective/kl": 32.62782669067383, - "objective/non_score_reward": -1.6313912868499756, - "objective/rlhf_reward": -4.921445462767201, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.873699426651001, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 849, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983861446380615 - }, - { - "episode": 13616, - "epoch": 0.08158080789924627, - "loss/policy_avg": 0.12610237300395966, - "lr": 9.456799591002046e-06, - "objective/entropy": -216.01071166992188, - "objective/kl": 31.95155906677246, - "objective/non_score_reward": -1.5975778102874756, - "objective/rlhf_reward": -5.048675945311218, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 17.690187454223633, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.630859375, - "step": 850, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9975669384002686 - }, - { - "episode": 13632, - "epoch": 0.08167667253837581, - "loss/policy_avg": 0.09207138419151306, - "lr": 9.456160531697343e-06, - "objective/entropy": -213.504638671875, - "objective/kl": 33.958152770996094, - "objective/non_score_reward": -1.69790780544281, - "objective/rlhf_reward": -5.413029053298336, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 2.278407096862793, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.712890625, - "step": 851, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982938766479492 - }, - { - "episode": 13648, - "epoch": 0.08177253717750536, - "loss/policy_avg": 0.7879657745361328, - "lr": 9.45552147239264e-06, - "objective/entropy": -179.40536499023438, - "objective/kl": 38.20147705078125, - "objective/non_score_reward": -1.91007399559021, - "objective/rlhf_reward": -6.216463644702998, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 3.275893211364746, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.52734375, - "step": 852, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000800609588623 - }, - { - "episode": 13664, - "epoch": 0.0818684018166349, - "loss/policy_avg": -0.05168546736240387, - "lr": 9.454882413087935e-06, - "objective/entropy": -252.6636505126953, - "objective/kl": 36.603004455566406, - "objective/non_score_reward": -1.8301501274108887, - "objective/rlhf_reward": -5.65874100250064, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.8799333572387695, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.734375, - "step": 853, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000197410583496 - }, - { - "episode": 13680, - "epoch": 0.08196426645576446, - "loss/policy_avg": 0.35540589690208435, - "lr": 9.454243353783232e-06, - "objective/entropy": -263.326171875, - "objective/kl": 31.936683654785156, - "objective/non_score_reward": -1.5968341827392578, - "objective/rlhf_reward": -5.0618239379226395, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 9.10447883605957, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.591796875, - "step": 854, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981427192687988 - }, - { - "episode": 13696, - "epoch": 0.082060131094894, - "loss/policy_avg": -0.01314299926161766, - "lr": 9.453604294478529e-06, - "objective/entropy": -50.54448699951172, - "objective/kl": 27.010623931884766, - "objective/non_score_reward": -1.3505312204360962, - "objective/rlhf_reward": -4.002124941349029, - "objective/scores": 0.35, - "policy/approxkl_avg": 72.71121215820312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.755859375, - "step": 855, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9988174438476562 - }, - { - "episode": 13712, - "epoch": 0.08215599573402356, - "loss/policy_avg": 0.507459282875061, - "lr": 9.452965235173824e-06, - "objective/entropy": -196.7661590576172, - "objective/kl": 41.39533615112305, - "objective/non_score_reward": -2.0697667598724365, - "objective/rlhf_reward": -6.331655929760869, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 6.633426666259766, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.734375, - "step": 856, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9997053146362305 - }, - { - "episode": 13728, - "epoch": 0.0822518603731531, - "loss/policy_avg": 0.01022842712700367, - "lr": 9.452326175869121e-06, - "objective/entropy": -165.575439453125, - "objective/kl": 28.162111282348633, - "objective/non_score_reward": -1.408105492591858, - "objective/rlhf_reward": -4.253819801894528, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 2.566072463989258, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.673828125, - "step": 857, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994425773620605 - }, - { - "episode": 13744, - "epoch": 0.08234772501228266, - "loss/policy_avg": 0.5199975371360779, - "lr": 9.451687116564418e-06, - "objective/entropy": -191.289794921875, - "objective/kl": 25.639848709106445, - "objective/non_score_reward": -1.2819924354553223, - "objective/rlhf_reward": -3.6121978996121253, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.2938754558563232, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.4873046875, - "step": 858, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9993865489959717 - }, - { - "episode": 13760, - "epoch": 0.0824435896514122, - "loss/policy_avg": -0.09089094400405884, - "lr": 9.451048057259715e-06, - "objective/entropy": -222.6432647705078, - "objective/kl": 35.101905822753906, - "objective/non_score_reward": -1.7550954818725586, - "objective/rlhf_reward": -5.641779580203396, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.5215179920196533, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.701171875, - "step": 859, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0004208087921143 - }, - { - "episode": 13776, - "epoch": 0.08253945429054176, - "loss/policy_avg": 0.3994244635105133, - "lr": 9.45040899795501e-06, - "objective/entropy": -232.05795288085938, - "objective/kl": 35.13083267211914, - "objective/non_score_reward": -1.7565417289733887, - "objective/rlhf_reward": -2.6261669158935543, - "objective/scores": 1.1, - "policy/approxkl_avg": 7.337094306945801, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.736328125, - "step": 860, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000622272491455 - }, - { - "episode": 13792, - "epoch": 0.0826353189296713, - "loss/policy_avg": 0.241072878241539, - "lr": 9.449769938650307e-06, - "objective/entropy": -235.5231475830078, - "objective/kl": 42.96981430053711, - "objective/non_score_reward": -2.1484906673431396, - "objective/rlhf_reward": -6.860629336039224, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 5.666136264801025, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 861, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9980902671813965 - }, - { - "episode": 13808, - "epoch": 0.08273118356880085, - "loss/policy_avg": 0.06892701238393784, - "lr": 9.449130879345604e-06, - "objective/entropy": -43.37392044067383, - "objective/kl": 28.94279670715332, - "objective/non_score_reward": -1.447139859199524, - "objective/rlhf_reward": -4.446923902540832, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 13.682140350341797, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.873046875, - "step": 862, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998504638671875 - }, - { - "episode": 13824, - "epoch": 0.0828270482079304, - "loss/policy_avg": 0.05104389786720276, - "lr": 9.4484918200409e-06, - "objective/entropy": -274.24462890625, - "objective/kl": 26.58008575439453, - "objective/non_score_reward": -1.3290044069290161, - "objective/rlhf_reward": -3.6541578821545704, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 14.673041343688965, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6796875, - "step": 863, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998407244682312 - }, - { - "episode": 13840, - "epoch": 0.08292291284705995, - "loss/policy_avg": 2.0433521270751953, - "lr": 9.447852760736197e-06, - "objective/entropy": -141.08175659179688, - "objective/kl": 38.66474151611328, - "objective/non_score_reward": -1.933237075805664, - "objective/rlhf_reward": -6.282350401492462, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 3.4866771697998047, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.849609375, - "step": 864, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0020241737365723 - }, - { - "episode": 13856, - "epoch": 0.08301877748618951, - "loss/policy_avg": 0.5822303891181946, - "lr": 9.447213701431494e-06, - "objective/entropy": -75.44483184814453, - "objective/kl": 42.41981506347656, - "objective/non_score_reward": -2.1209909915924072, - "objective/rlhf_reward": -7.033365587802276, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 1.0502395629882812, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.763671875, - "step": 865, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0002496242523193 - }, - { - "episode": 13872, - "epoch": 0.08311464212531905, - "loss/policy_avg": 1.5961978435516357, - "lr": 9.44657464212679e-06, - "objective/entropy": -102.62336730957031, - "objective/kl": 32.63710021972656, - "objective/non_score_reward": -1.6318550109863281, - "objective/rlhf_reward": -4.702591176303934, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 44.34449005126953, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.583984375, - "step": 866, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997666597366333 - }, - { - "episode": 13888, - "epoch": 0.0832105067644486, - "loss/policy_avg": -0.06377097964286804, - "lr": 9.445935582822086e-06, - "objective/entropy": -179.53016662597656, - "objective/kl": 27.1846981048584, - "objective/non_score_reward": -1.3592349290847778, - "objective/rlhf_reward": -3.3142334840455394, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 11.25791072845459, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.67578125, - "step": 867, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001521110534668 - }, - { - "episode": 13904, - "epoch": 0.08330637140357815, - "loss/policy_avg": 0.06122337281703949, - "lr": 9.445296523517383e-06, - "objective/entropy": -160.8975830078125, - "objective/kl": 37.28607940673828, - "objective/non_score_reward": -1.8643040657043457, - "objective/rlhf_reward": -6.131703171759767, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 2.380110263824463, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.685546875, - "step": 868, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9993302822113037 - }, - { - "episode": 13920, - "epoch": 0.0834022360427077, - "loss/policy_avg": 0.06397978216409683, - "lr": 9.44465746421268e-06, - "objective/entropy": -279.75146484375, - "objective/kl": 36.54051971435547, - "objective/non_score_reward": -1.8270260095596313, - "objective/rlhf_reward": -5.3606928093003585, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 9.166413307189941, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6953125, - "step": 869, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998555064201355 - }, - { - "episode": 13936, - "epoch": 0.08349810068183725, - "loss/policy_avg": 0.18339544534683228, - "lr": 9.444018404907977e-06, - "objective/entropy": -197.06088256835938, - "objective/kl": 35.413883209228516, - "objective/non_score_reward": -1.7706942558288574, - "objective/rlhf_reward": -5.420917516172516, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 2.4228196144104004, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.55859375, - "step": 870, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9997130632400513 - }, - { - "episode": 13952, - "epoch": 0.0835939653209668, - "loss/policy_avg": 0.7395508885383606, - "lr": 9.443379345603272e-06, - "objective/entropy": -175.5420684814453, - "objective/kl": 27.310260772705078, - "objective/non_score_reward": -1.3655130863189697, - "objective/rlhf_reward": -3.9057928611903936, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 20.016393661499023, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.744140625, - "step": 871, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9980931282043457 - }, - { - "episode": 13968, - "epoch": 0.08368982996009634, - "loss/policy_avg": 0.11419187486171722, - "lr": 9.442740286298569e-06, - "objective/entropy": -202.19219970703125, - "objective/kl": 26.73446273803711, - "objective/non_score_reward": -1.3367230892181396, - "objective/rlhf_reward": -0.9468923568725582, - "objective/scores": 1.1, - "policy/approxkl_avg": 1.4593892097473145, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.505859375, - "step": 872, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9997605085372925 - }, - { - "episode": 13984, - "epoch": 0.0837856945992259, - "loss/policy_avg": 0.10254265367984772, - "lr": 9.442101226993866e-06, - "objective/entropy": -181.49607849121094, - "objective/kl": 34.489620208740234, - "objective/non_score_reward": -1.7244811058044434, - "objective/rlhf_reward": -5.2938043213525585, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 10.685236930847168, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.572265625, - "step": 873, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0004138946533203 - }, - { - "episode": 14000, - "epoch": 0.08388155923835544, - "loss/policy_avg": -0.11048807948827744, - "lr": 9.441462167689163e-06, - "objective/entropy": -233.58718872070312, - "objective/kl": 27.196325302124023, - "objective/non_score_reward": -1.359816312789917, - "objective/rlhf_reward": -4.080015146468563, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 7.074767112731934, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.599609375, - "step": 874, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000337600708008 - }, - { - "episode": 14016, - "epoch": 0.083977423877485, - "loss/policy_avg": -0.04991217330098152, - "lr": 9.44082310838446e-06, - "objective/entropy": -147.29574584960938, - "objective/kl": 39.145992279052734, - "objective/non_score_reward": -1.9572995901107788, - "objective/rlhf_reward": -6.429198360443115, - "objective/scores": 0.35, - "policy/approxkl_avg": 2.3655714988708496, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.703125, - "step": 875, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0024516582489014 - }, - { - "episode": 14032, - "epoch": 0.08407328851661454, - "loss/policy_avg": 0.018214020878076553, - "lr": 9.440184049079757e-06, - "objective/entropy": -225.25274658203125, - "objective/kl": 28.496929168701172, - "objective/non_score_reward": -1.4248464107513428, - "objective/rlhf_reward": -4.248787502856597, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 8.280494689941406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.78515625, - "step": 876, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0028023719787598 - }, - { - "episode": 14048, - "epoch": 0.0841691531557441, - "loss/policy_avg": -0.0712839737534523, - "lr": 9.439544989775052e-06, - "objective/entropy": -111.49925231933594, - "objective/kl": 33.307395935058594, - "objective/non_score_reward": -1.6653697490692139, - "objective/rlhf_reward": -5.237647135456172, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 10.649118423461914, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.744140625, - "step": 877, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0005428791046143 - }, - { - "episode": 14064, - "epoch": 0.08426501779487364, - "loss/policy_avg": 0.416260302066803, - "lr": 9.438905930470349e-06, - "objective/entropy": -91.5921630859375, - "objective/kl": 36.07551193237305, - "objective/non_score_reward": -1.8037755489349365, - "objective/rlhf_reward": -5.658842890468195, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.971528053283691, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.546875, - "step": 878, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.99779212474823 - }, - { - "episode": 14080, - "epoch": 0.08436088243400319, - "loss/policy_avg": 0.6945221424102783, - "lr": 9.438266871165644e-06, - "objective/entropy": -103.2996597290039, - "objective/kl": 29.02838706970215, - "objective/non_score_reward": -1.4514193534851074, - "objective/rlhf_reward": -4.249418287482813, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 3.5951104164123535, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6015625, - "step": 879, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9996235370635986 - }, - { - "episode": 14096, - "epoch": 0.08445674707313273, - "loss/policy_avg": 0.14096970856189728, - "lr": 9.43762781186094e-06, - "objective/entropy": -250.6915283203125, - "objective/kl": 24.03522491455078, - "objective/non_score_reward": -1.2017613649368286, - "objective/rlhf_reward": -3.4284433508790553, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 141.8468017578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.681640625, - "step": 880, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9993736743927002 - }, - { - "episode": 14112, - "epoch": 0.08455261171226229, - "loss/policy_avg": 0.3699185848236084, - "lr": 9.436988752556238e-06, - "objective/entropy": -159.3045196533203, - "objective/kl": 40.019386291503906, - "objective/non_score_reward": -2.000969409942627, - "objective/rlhf_reward": -6.553278903575286, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 8.20317554473877, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.775390625, - "step": 881, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977927207946777 - }, - { - "episode": 14128, - "epoch": 0.08464847635139183, - "loss/policy_avg": 0.41995298862457275, - "lr": 9.436349693251534e-06, - "objective/entropy": 76.95626068115234, - "objective/kl": 39.00627899169922, - "objective/non_score_reward": -1.9503138065338135, - "objective/rlhf_reward": -6.139395838201629, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 31.75859832763672, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.568359375, - "step": 882, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9953019618988037 - }, - { - "episode": 14144, - "epoch": 0.08474434099052139, - "loss/policy_avg": 0.5355075597763062, - "lr": 9.435710633946831e-06, - "objective/entropy": -164.35186767578125, - "objective/kl": 42.27740478515625, - "objective/non_score_reward": -2.113870143890381, - "objective/rlhf_reward": -7.113845041304259, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 20.66805648803711, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.72265625, - "step": 883, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9979139566421509 - }, - { - "episode": 14160, - "epoch": 0.08484020562965093, - "loss/policy_avg": 0.12046757340431213, - "lr": 9.435071574642126e-06, - "objective/entropy": -139.48226928710938, - "objective/kl": 35.96446228027344, - "objective/non_score_reward": -1.7982230186462402, - "objective/rlhf_reward": -5.833642208312435, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 5.584999084472656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.763671875, - "step": 884, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985802173614502 - }, - { - "episode": 14176, - "epoch": 0.08493607026878049, - "loss/policy_avg": 0.20259422063827515, - "lr": 9.434432515337423e-06, - "objective/entropy": -194.32472229003906, - "objective/kl": 29.422592163085938, - "objective/non_score_reward": -1.4711295366287231, - "objective/rlhf_reward": -4.151184813181559, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 7.590093612670898, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.705078125, - "step": 885, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000078678131104 - }, - { - "episode": 14192, - "epoch": 0.08503193490791003, - "loss/policy_avg": 0.38378089666366577, - "lr": 9.43379345603272e-06, - "objective/entropy": -204.80718994140625, - "objective/kl": 26.858444213867188, - "objective/non_score_reward": -1.3429222106933594, - "objective/rlhf_reward": -3.947856862743465, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 54.279869079589844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8828125, - "step": 886, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000415086746216 - }, - { - "episode": 14208, - "epoch": 0.08512779954703958, - "loss/policy_avg": 0.27804744243621826, - "lr": 9.433154396728017e-06, - "objective/entropy": -216.76026916503906, - "objective/kl": 31.35245704650879, - "objective/non_score_reward": -1.5676229000091553, - "objective/rlhf_reward": -4.928855529337554, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 44.15214157104492, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.841796875, - "step": 887, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9985570907592773 - }, - { - "episode": 14224, - "epoch": 0.08522366418616913, - "loss/policy_avg": 0.1285010725259781, - "lr": 9.432515337423314e-06, - "objective/entropy": -256.2292175292969, - "objective/kl": 22.457351684570312, - "objective/non_score_reward": -1.1228675842285156, - "objective/rlhf_reward": -2.5440589291619613, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.694319725036621, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.591796875, - "step": 888, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9982428550720215 - }, - { - "episode": 14240, - "epoch": 0.08531952882529868, - "loss/policy_avg": 0.1620079129934311, - "lr": 9.431876278118611e-06, - "objective/entropy": -246.3665313720703, - "objective/kl": 32.27862548828125, - "objective/non_score_reward": -1.6139311790466309, - "objective/rlhf_reward": -5.03189285536584, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 4.128833293914795, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.626953125, - "step": 889, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0017335414886475 - }, - { - "episode": 14256, - "epoch": 0.08541539346442822, - "loss/policy_avg": 0.6714350581169128, - "lr": 9.431237218813906e-06, - "objective/entropy": -87.00444793701172, - "objective/kl": 30.12160873413086, - "objective/non_score_reward": -1.5060807466506958, - "objective/rlhf_reward": -4.600490768154231, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 30.543041229248047, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 890, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9955544471740723 - }, - { - "episode": 14272, - "epoch": 0.08551125810355778, - "loss/policy_avg": 0.5368032455444336, - "lr": 9.430598159509203e-06, - "objective/entropy": -151.2410125732422, - "objective/kl": 23.1306095123291, - "objective/non_score_reward": -1.1565306186676025, - "objective/rlhf_reward": -3.266872340176983, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 18.648775100708008, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 891, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999136209487915 - }, - { - "episode": 14288, - "epoch": 0.08560712274268732, - "loss/policy_avg": -0.4043048024177551, - "lr": 9.4299591002045e-06, - "objective/entropy": -214.12281799316406, - "objective/kl": 38.173484802246094, - "objective/non_score_reward": -1.9086743593215942, - "objective/rlhf_reward": -5.972837810934173, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 3.6675243377685547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.580078125, - "step": 892, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000812292098999 - }, - { - "episode": 14304, - "epoch": 0.08570298738181688, - "loss/policy_avg": 1.0885683298110962, - "lr": 9.429320040899797e-06, - "objective/entropy": -234.37998962402344, - "objective/kl": 27.995094299316406, - "objective/non_score_reward": -1.3997547626495361, - "objective/rlhf_reward": -4.239769363139553, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.2649099826812744, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6640625, - "step": 893, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0003528594970703 - }, - { - "episode": 14320, - "epoch": 0.08579885202094642, - "loss/policy_avg": -0.1013278141617775, - "lr": 9.428680981595094e-06, - "objective/entropy": -156.33245849609375, - "objective/kl": 35.587982177734375, - "objective/non_score_reward": -1.779399037361145, - "objective/rlhf_reward": -5.738993861762387, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 8.389669418334961, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.666015625, - "step": 894, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.99942946434021 - }, - { - "episode": 14336, - "epoch": 0.08589471666007598, - "loss/policy_avg": -0.006531273480504751, - "lr": 9.42804192229039e-06, - "objective/entropy": -197.26820373535156, - "objective/kl": 21.04766082763672, - "objective/non_score_reward": -1.0523829460144043, - "objective/rlhf_reward": -2.6532727172046453, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 1.4280903339385986, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.69140625, - "step": 895, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001786708831787 - }, - { - "episode": 14352, - "epoch": 0.08599058129920552, - "loss/policy_avg": 0.10259456932544708, - "lr": 9.427402862985686e-06, - "objective/entropy": -120.49540710449219, - "objective/kl": 37.17432403564453, - "objective/non_score_reward": -1.858716368675232, - "objective/rlhf_reward": -3.034865355491638, - "objective/scores": 1.1, - "policy/approxkl_avg": 6.6070685386657715, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.615234375, - "step": 896, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999729871749878 - }, - { - "episode": 14368, - "epoch": 0.08608644593833507, - "loss/policy_avg": 0.18344524502754211, - "lr": 9.426763803680982e-06, - "objective/entropy": -84.0172348022461, - "objective/kl": 32.38622283935547, - "objective/non_score_reward": -1.6193112134933472, - "objective/rlhf_reward": -5.117994987700863, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 33.82829284667969, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.529296875, - "step": 897, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9998817443847656 - }, - { - "episode": 14384, - "epoch": 0.08618231057746462, - "loss/policy_avg": 0.7863380312919617, - "lr": 9.42612474437628e-06, - "objective/entropy": -94.4057388305664, - "objective/kl": 31.75823402404785, - "objective/non_score_reward": -1.58791184425354, - "objective/rlhf_reward": -3.4279281839143962, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 3.406008243560791, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.8125, - "step": 898, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0038909912109375 - }, - { - "episode": 14400, - "epoch": 0.08627817521659417, - "loss/policy_avg": 0.5351603031158447, - "lr": 9.425485685071576e-06, - "objective/entropy": -265.2181396484375, - "objective/kl": 29.21182632446289, - "objective/non_score_reward": -1.460591197013855, - "objective/rlhf_reward": -4.1090314547220865, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 4.054888725280762, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.57421875, - "step": 899, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998013973236084 - }, - { - "episode": 14416, - "epoch": 0.08637403985572371, - "loss/policy_avg": 0.013534091413021088, - "lr": 9.424846625766873e-06, - "objective/entropy": -194.56564331054688, - "objective/kl": 24.970386505126953, - "objective/non_score_reward": -1.2485194206237793, - "objective/rlhf_reward": -3.6348278162225913, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 0.42985397577285767, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.53515625, - "step": 900, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0002646446228027 - }, - { - "episode": 14432, - "epoch": 0.08646990449485327, - "loss/policy_avg": -0.026506464928388596, - "lr": 9.424207566462168e-06, - "objective/entropy": -121.82954406738281, - "objective/kl": 38.97528839111328, - "objective/non_score_reward": -1.9487645626068115, - "objective/rlhf_reward": -7.795057892799377, - "objective/scores": 0.0, - "policy/approxkl_avg": 18.97709846496582, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.3564453125, - "step": 901, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9988110065460205 - }, - { - "episode": 14448, - "epoch": 0.08656576913398281, - "loss/policy_avg": 0.04643288254737854, - "lr": 9.423568507157465e-06, - "objective/entropy": -97.38468170166016, - "objective/kl": 28.042333602905273, - "objective/non_score_reward": -1.4021167755126953, - "objective/rlhf_reward": -4.184634823997585, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 2.1407618522644043, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.60546875, - "step": 902, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9999892711639404 - }, - { - "episode": 14464, - "epoch": 0.08666163377311237, - "loss/policy_avg": 0.5154027342796326, - "lr": 9.42292944785276e-06, - "objective/entropy": -250.2370147705078, - "objective/kl": 25.91543960571289, - "objective/non_score_reward": -1.2957720756530762, - "objective/rlhf_reward": -3.759255845745174, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 1.9840008020401, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.607421875, - "step": 903, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9984569549560547 - }, - { - "episode": 14480, - "epoch": 0.08675749841224191, - "loss/policy_avg": -0.12090878188610077, - "lr": 9.422290388548057e-06, - "objective/entropy": -224.9342041015625, - "objective/kl": 21.860130310058594, - "objective/non_score_reward": -1.0930064916610718, - "objective/rlhf_reward": -2.42461485691541, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 6.253545761108398, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.751953125, - "step": 904, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000030755996704 - }, - { - "episode": 14496, - "epoch": 0.08685336305137147, - "loss/policy_avg": 0.2192097306251526, - "lr": 9.421651329243354e-06, - "objective/entropy": -116.75704956054688, - "objective/kl": 40.641937255859375, - "objective/non_score_reward": -2.0320968627929688, - "objective/rlhf_reward": -6.786751320868163, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 3.1222383975982666, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.525390625, - "step": 905, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990143775939941 - }, - { - "episode": 14512, - "epoch": 0.08694922769050101, - "loss/policy_avg": 0.014911421574652195, - "lr": 9.421012269938651e-06, - "objective/entropy": -169.34967041015625, - "objective/kl": 19.47471809387207, - "objective/non_score_reward": -0.9737359285354614, - "objective/rlhf_reward": -1.7722373626389838, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 3.2120165824890137, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 906, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0021252632141113 - }, - { - "episode": 14528, - "epoch": 0.08704509232963056, - "loss/policy_avg": -0.06861399114131927, - "lr": 9.420373210633948e-06, - "objective/entropy": -199.73748779296875, - "objective/kl": 32.33728790283203, - "objective/non_score_reward": -1.6168644428253174, - "objective/rlhf_reward": -5.088855722037655, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 7.329561233520508, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.58203125, - "step": 907, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.1414122581481934 - }, - { - "episode": 14544, - "epoch": 0.0871409569687601, - "loss/policy_avg": -0.0006491807289421558, - "lr": 9.419734151329245e-06, - "objective/entropy": -241.5037078857422, - "objective/kl": 26.676612854003906, - "objective/non_score_reward": -1.3338308334350586, - "objective/rlhf_reward": -0.9353229761123654, - "objective/scores": 1.1, - "policy/approxkl_avg": 2.882882595062256, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.568359375, - "step": 908, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995675086975098 - }, - { - "episode": 14560, - "epoch": 0.08723682160788966, - "loss/policy_avg": -0.30844664573669434, - "lr": 9.41909509202454e-06, - "objective/entropy": -193.48281860351562, - "objective/kl": 32.22890853881836, - "objective/non_score_reward": -1.6114455461502075, - "objective/rlhf_reward": -4.712448493639627, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 9.586688995361328, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.537109375, - "step": 909, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0032057762145996 - }, - { - "episode": 14576, - "epoch": 0.0873326862470192, - "loss/policy_avg": 0.10456671565771103, - "lr": 9.418456032719837e-06, - "objective/entropy": -214.8862762451172, - "objective/kl": 30.845359802246094, - "objective/non_score_reward": -1.5422677993774414, - "objective/rlhf_reward": -4.769071197509765, - "objective/scores": 0.35, - "policy/approxkl_avg": 48.766883850097656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.560546875, - "step": 910, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0017874240875244 - }, - { - "episode": 14592, - "epoch": 0.08742855088614876, - "loss/policy_avg": 0.011322952806949615, - "lr": 9.417816973415134e-06, - "objective/entropy": -148.18869018554688, - "objective/kl": 34.653785705566406, - "objective/non_score_reward": -1.7326891422271729, - "objective/rlhf_reward": -5.5069247080880075, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 2.303962230682373, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.740234375, - "step": 911, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.001537561416626 - }, - { - "episode": 14608, - "epoch": 0.0875244155252783, - "loss/policy_avg": 1.4446654319763184, - "lr": 9.41717791411043e-06, - "objective/entropy": -151.7039337158203, - "objective/kl": 36.139678955078125, - "objective/non_score_reward": -1.8069840669631958, - "objective/rlhf_reward": -5.623816165987568, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 8.342704772949219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 912, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.997730016708374 - }, - { - "episode": 14624, - "epoch": 0.08762028016440786, - "loss/policy_avg": 0.007501431740820408, - "lr": 9.416538854805727e-06, - "objective/entropy": -192.82723999023438, - "objective/kl": 28.006526947021484, - "objective/non_score_reward": -1.4003264904022217, - "objective/rlhf_reward": -3.776477153572153, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 9.397720336914062, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.74609375, - "step": 913, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9988361597061157 - }, - { - "episode": 14640, - "epoch": 0.0877161448035374, - "loss/policy_avg": 0.7067223787307739, - "lr": 9.415899795501023e-06, - "objective/entropy": -199.13888549804688, - "objective/kl": 40.245330810546875, - "objective/non_score_reward": -2.0122666358947754, - "objective/rlhf_reward": -6.387206798017608, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.6032171249389648, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.626953125, - "step": 914, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0032029151916504 - }, - { - "episode": 14656, - "epoch": 0.08781200944266696, - "loss/policy_avg": 0.7447987198829651, - "lr": 9.41526073619632e-06, - "objective/entropy": -192.03024291992188, - "objective/kl": 33.84302520751953, - "objective/non_score_reward": -1.6921510696411133, - "objective/rlhf_reward": -5.212345330920771, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 12.58854866027832, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.708984375, - "step": 915, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982028007507324 - }, - { - "episode": 14672, - "epoch": 0.0879078740817965, - "loss/policy_avg": -0.12448902428150177, - "lr": 9.414621676891616e-06, - "objective/entropy": -108.39199829101562, - "objective/kl": 27.540185928344727, - "objective/non_score_reward": -1.3770092725753784, - "objective/rlhf_reward": -3.3853308580079418, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 0.6809393763542175, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.771484375, - "step": 916, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0027718544006348 - }, - { - "episode": 14688, - "epoch": 0.08800373872092605, - "loss/policy_avg": 0.09778769314289093, - "lr": 9.413982617586913e-06, - "objective/entropy": -83.20165252685547, - "objective/kl": 27.68124008178711, - "objective/non_score_reward": -1.3840619325637817, - "objective/rlhf_reward": -3.7114191010323276, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 7.001269340515137, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 917, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9994606971740723 - }, - { - "episode": 14704, - "epoch": 0.0880996033600556, - "loss/policy_avg": 0.7267050743103027, - "lr": 9.41334355828221e-06, - "objective/entropy": -174.48663330078125, - "objective/kl": 32.38935089111328, - "objective/non_score_reward": -1.6194674968719482, - "objective/rlhf_reward": -6.477869987487793, - "objective/scores": 0.0, - "policy/approxkl_avg": 9.753436088562012, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6796875, - "step": 918, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9973618984222412 - }, - { - "episode": 14720, - "epoch": 0.08819546799918515, - "loss/policy_avg": 0.18099595606327057, - "lr": 9.412704498977507e-06, - "objective/entropy": -232.4264373779297, - "objective/kl": 37.20670700073242, - "objective/non_score_reward": -1.860335350036621, - "objective/rlhf_reward": -6.115828309088869, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 41.893341064453125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.79296875, - "step": 919, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002889633178711 - }, - { - "episode": 14736, - "epoch": 0.08829133263831469, - "loss/policy_avg": 0.43639302253723145, - "lr": 9.412065439672802e-06, - "objective/entropy": -183.69644165039062, - "objective/kl": 24.13558006286621, - "objective/non_score_reward": -1.2067790031433105, - "objective/rlhf_reward": -2.8797047836350753, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 30.2447509765625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.55078125, - "step": 920, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992506504058838 - }, - { - "episode": 14752, - "epoch": 0.08838719727744425, - "loss/policy_avg": 0.5567411780357361, - "lr": 9.411426380368099e-06, - "objective/entropy": -285.06512451171875, - "objective/kl": 32.89839553833008, - "objective/non_score_reward": -1.644919753074646, - "objective/rlhf_reward": -4.8463457385698945, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 18.15423583984375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.609375, - "step": 921, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997804880142212 - }, - { - "episode": 14768, - "epoch": 0.0884830619165738, - "loss/policy_avg": -0.024971559643745422, - "lr": 9.410787321063396e-06, - "objective/entropy": -144.00473022460938, - "objective/kl": 27.061277389526367, - "objective/non_score_reward": -1.353063941001892, - "objective/rlhf_reward": -4.033653714743954, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.530630111694336, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 922, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0000722408294678 - }, - { - "episode": 14784, - "epoch": 0.08857892655570335, - "loss/policy_avg": -0.49618157744407654, - "lr": 9.410148261758691e-06, - "objective/entropy": -37.43824768066406, - "objective/kl": 35.81788635253906, - "objective/non_score_reward": -1.7908943891525269, - "objective/rlhf_reward": -5.501717870653259, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 4.131357192993164, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.4287109375, - "step": 923, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0173897743225098 - }, - { - "episode": 14800, - "epoch": 0.0886747911948329, - "loss/policy_avg": 0.05783979594707489, - "lr": 9.409509202453988e-06, - "objective/entropy": -154.13516235351562, - "objective/kl": 46.57448196411133, - "objective/non_score_reward": -2.3287243843078613, - "objective/rlhf_reward": -7.653037791669952, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 2.5200886726379395, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5390625, - "step": 924, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9987819194793701 - }, - { - "episode": 14816, - "epoch": 0.08877065583396244, - "loss/policy_avg": 0.034926094114780426, - "lr": 9.408870143149285e-06, - "objective/entropy": -221.52577209472656, - "objective/kl": 35.47760772705078, - "objective/non_score_reward": -1.7738804817199707, - "objective/rlhf_reward": -5.614569070752024, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.4324332475662231, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 925, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995288848876953 - }, - { - "episode": 14832, - "epoch": 0.088866520473092, - "loss/policy_avg": 0.32427555322647095, - "lr": 9.408231083844582e-06, - "objective/entropy": -130.25445556640625, - "objective/kl": 34.63972473144531, - "objective/non_score_reward": -1.7319860458374023, - "objective/rlhf_reward": -5.371684878078058, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 3.6408345699310303, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.568359375, - "step": 926, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0012738704681396 - }, - { - "episode": 14848, - "epoch": 0.08896238511222154, - "loss/policy_avg": -0.27763280272483826, - "lr": 9.407592024539877e-06, - "objective/entropy": -244.65667724609375, - "objective/kl": 27.930646896362305, - "objective/non_score_reward": -1.396532416343689, - "objective/rlhf_reward": -3.7613009765473118, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 65.45894622802734, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.66015625, - "step": 927, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.002328872680664 - }, - { - "episode": 14864, - "epoch": 0.0890582497513511, - "loss/policy_avg": 0.39164024591445923, - "lr": 9.406952965235174e-06, - "objective/entropy": -92.6754150390625, - "objective/kl": 40.35970687866211, - "objective/non_score_reward": -2.0179853439331055, - "objective/rlhf_reward": -5.148222361446592, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 9.222280502319336, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.587890625, - "step": 928, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9965031147003174 - }, - { - "episode": 14880, - "epoch": 0.08915411439048064, - "loss/policy_avg": 0.018820755183696747, - "lr": 9.40631390593047e-06, - "objective/entropy": -221.75802612304688, - "objective/kl": 32.733848571777344, - "objective/non_score_reward": -1.6366922855377197, - "objective/rlhf_reward": -4.942649397913533, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 1.5601739883422852, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.67578125, - "step": 929, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0016555786132812 - }, - { - "episode": 14896, - "epoch": 0.0892499790296102, - "loss/policy_avg": 0.02956710010766983, - "lr": 9.405674846625768e-06, - "objective/entropy": -225.1991729736328, - "objective/kl": 27.00541114807129, - "objective/non_score_reward": -1.3502705097198486, - "objective/rlhf_reward": -4.059446623831421, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 6.755413055419922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8046875, - "step": 930, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001051664352417 - }, - { - "episode": 14912, - "epoch": 0.08934584366873974, - "loss/policy_avg": 0.08642945438623428, - "lr": 9.405035787321065e-06, - "objective/entropy": -179.3356475830078, - "objective/kl": 36.390193939208984, - "objective/non_score_reward": -1.8195096254348755, - "objective/rlhf_reward": -5.330627392010625, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 10.583852767944336, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4501953125, - "step": 931, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999453067779541 - }, - { - "episode": 14928, - "epoch": 0.0894417083078693, - "loss/policy_avg": 0.08466912060976028, - "lr": 9.404396728016361e-06, - "objective/entropy": -160.34024047851562, - "objective/kl": 48.99607849121094, - "objective/non_score_reward": -2.4498043060302734, - "objective/rlhf_reward": -8.195096407000142, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 0.9886335134506226, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.666015625, - "step": 932, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0025954246520996 - }, - { - "episode": 14944, - "epoch": 0.08953757294699884, - "loss/policy_avg": 0.3508598804473877, - "lr": 9.403757668711657e-06, - "objective/entropy": -177.20993041992188, - "objective/kl": 32.381324768066406, - "objective/non_score_reward": -1.6190659999847412, - "objective/rlhf_reward": -5.150751504927797, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 31.277324676513672, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.4716796875, - "step": 933, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9997992515563965 - }, - { - "episode": 14960, - "epoch": 0.08963343758612839, - "loss/policy_avg": 0.11015394330024719, - "lr": 9.403118609406953e-06, - "objective/entropy": -203.39776611328125, - "objective/kl": 32.743534088134766, - "objective/non_score_reward": -1.637176752090454, - "objective/rlhf_reward": -4.94458726412447, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.4484572410583496, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.734375, - "step": 934, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9993422031402588 - }, - { - "episode": 14976, - "epoch": 0.08972930222525793, - "loss/policy_avg": -0.10944172739982605, - "lr": 9.40247955010225e-06, - "objective/entropy": -241.4989013671875, - "objective/kl": 21.90488052368164, - "objective/non_score_reward": -1.0952439308166504, - "objective/rlhf_reward": -3.0217259762033652, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 3.7654926776885986, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.689453125, - "step": 935, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002392292022705 - }, - { - "episode": 14992, - "epoch": 0.08982516686438749, - "loss/policy_avg": 0.9405217170715332, - "lr": 9.401840490797547e-06, - "objective/entropy": -237.89816284179688, - "objective/kl": 25.436769485473633, - "objective/non_score_reward": -1.2718384265899658, - "objective/rlhf_reward": -3.663521905143825, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 6.3816022872924805, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.60546875, - "step": 936, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9998328685760498 - }, - { - "episode": 15008, - "epoch": 0.08992103150351703, - "loss/policy_avg": 0.3327906131744385, - "lr": 9.401201431492844e-06, - "objective/entropy": -268.6925354003906, - "objective/kl": 37.998870849609375, - "objective/non_score_reward": -1.899943470954895, - "objective/rlhf_reward": -6.0435144593387395, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 10.16036605834961, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.607421875, - "step": 937, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984662532806396 - }, - { - "episode": 15024, - "epoch": 0.09001689614264659, - "loss/policy_avg": -0.26467132568359375, - "lr": 9.40056237218814e-06, - "objective/entropy": -231.59254455566406, - "objective/kl": 26.266529083251953, - "objective/non_score_reward": -1.3133264780044556, - "objective/rlhf_reward": -3.737534248622593, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 8.63685417175293, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6328125, - "step": 938, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999743938446045 - }, - { - "episode": 15040, - "epoch": 0.09011276078177613, - "loss/policy_avg": 0.2447420209646225, - "lr": 9.399923312883436e-06, - "objective/entropy": -278.01153564453125, - "objective/kl": 27.628671646118164, - "objective/non_score_reward": -1.3814334869384766, - "objective/rlhf_reward": -4.147132196513516, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 2.7261061668395996, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.669921875, - "step": 939, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990365505218506 - }, - { - "episode": 15056, - "epoch": 0.09020862542090569, - "loss/policy_avg": 0.2600797414779663, - "lr": 9.399284253578733e-06, - "objective/entropy": -242.6852264404297, - "objective/kl": 40.91444396972656, - "objective/non_score_reward": -2.045722484588623, - "objective/rlhf_reward": -6.060183467642341, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 7.501818656921387, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 940, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9971075057983398 - }, - { - "episode": 15072, - "epoch": 0.09030449006003523, - "loss/policy_avg": 0.3729836940765381, - "lr": 9.39864519427403e-06, - "objective/entropy": -225.56338500976562, - "objective/kl": 34.106658935546875, - "objective/non_score_reward": -1.7053331136703491, - "objective/rlhf_reward": -5.340379836972117, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 3.6144325733184814, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73828125, - "step": 941, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977765083312988 - }, - { - "episode": 15088, - "epoch": 0.09040035469916478, - "loss/policy_avg": 0.571183443069458, - "lr": 9.398006134969327e-06, - "objective/entropy": -109.51638793945312, - "objective/kl": 57.49871826171875, - "objective/non_score_reward": -2.8749358654022217, - "objective/rlhf_reward": -9.895623478952961, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.06275749206543, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.736328125, - "step": 942, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000272512435913 - }, - { - "episode": 15104, - "epoch": 0.09049621933829433, - "loss/policy_avg": 0.7253443002700806, - "lr": 9.397367075664624e-06, - "objective/entropy": -69.86570739746094, - "objective/kl": 40.12030029296875, - "objective/non_score_reward": -2.0060153007507324, - "objective/rlhf_reward": -6.362201397836792, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 66.08172607421875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 943, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997114896774292 - }, - { - "episode": 15120, - "epoch": 0.09059208397742388, - "loss/policy_avg": 0.7548943758010864, - "lr": 9.396728016359919e-06, - "objective/entropy": -264.1029357910156, - "objective/kl": 29.125934600830078, - "objective/non_score_reward": -1.456296682357788, - "objective/rlhf_reward": -4.268927424159601, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 1.555539846420288, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 944, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0007224082946777 - }, - { - "episode": 15136, - "epoch": 0.09068794861655342, - "loss/policy_avg": -0.06224450469017029, - "lr": 9.396088957055216e-06, - "objective/entropy": -215.80255126953125, - "objective/kl": 36.1290283203125, - "objective/non_score_reward": -1.8064515590667725, - "objective/rlhf_reward": -5.7100345728718604, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.062628746032715, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.5703125, - "step": 945, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0022828578948975 - }, - { - "episode": 15152, - "epoch": 0.09078381325568298, - "loss/policy_avg": -0.34320878982543945, - "lr": 9.395449897750511e-06, - "objective/entropy": -254.14260864257812, - "objective/kl": 24.163818359375, - "objective/non_score_reward": -1.20819091796875, - "objective/rlhf_reward": -2.4327639102935787, - "objective/scores": 0.6, - "policy/approxkl_avg": 3.011139392852783, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.62109375, - "step": 946, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0023422241210938 - }, - { - "episode": 15168, - "epoch": 0.09087967789481252, - "loss/policy_avg": 0.08071097731590271, - "lr": 9.394810838445808e-06, - "objective/entropy": -269.91180419921875, - "objective/kl": 29.857431411743164, - "objective/non_score_reward": -1.4928715229034424, - "objective/rlhf_reward": -3.8487801573434215, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 7.305149555206299, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 947, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9971578121185303 - }, - { - "episode": 15184, - "epoch": 0.09097554253394208, - "loss/policy_avg": -0.019624732434749603, - "lr": 9.394171779141105e-06, - "objective/entropy": -274.10198974609375, - "objective/kl": 33.219993591308594, - "objective/non_score_reward": -1.6609996557235718, - "objective/rlhf_reward": -6.643998503684998, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.708046913146973, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 948, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000514030456543 - }, - { - "episode": 15200, - "epoch": 0.09107140717307162, - "loss/policy_avg": -0.5435956716537476, - "lr": 9.393532719836402e-06, - "objective/entropy": -245.58270263671875, - "objective/kl": 26.876476287841797, - "objective/non_score_reward": -1.3438239097595215, - "objective/rlhf_reward": -3.771175924603062, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 10.178674697875977, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.630859375, - "step": 949, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0033979415893555 - }, - { - "episode": 15216, - "epoch": 0.09116727181220118, - "loss/policy_avg": 0.6083466410636902, - "lr": 9.392893660531698e-06, - "objective/entropy": -169.32357788085938, - "objective/kl": 38.449127197265625, - "objective/non_score_reward": -1.9224563837051392, - "objective/rlhf_reward": -6.133565931525782, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 8.572129249572754, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.685546875, - "step": 950, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000035285949707 - }, - { - "episode": 15232, - "epoch": 0.09126313645133072, - "loss/policy_avg": 0.1515914499759674, - "lr": 9.392254601226994e-06, - "objective/entropy": -181.75010681152344, - "objective/kl": 31.95659637451172, - "objective/non_score_reward": -1.5978299379348755, - "objective/rlhf_reward": -5.04968385985437, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 12.761173248291016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.798828125, - "step": 951, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977487325668335 - }, - { - "episode": 15248, - "epoch": 0.09135900109046027, - "loss/policy_avg": 0.7638048529624939, - "lr": 9.39161554192229e-06, - "objective/entropy": -158.99050903320312, - "objective/kl": 39.69103240966797, - "objective/non_score_reward": -1.9845517873764038, - "objective/rlhf_reward": -5.815500917212043, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 11.06544303894043, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.541015625, - "step": 952, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981579780578613 - }, - { - "episode": 15264, - "epoch": 0.09145486572958982, - "loss/policy_avg": 0.764492392539978, - "lr": 9.390976482617587e-06, - "objective/entropy": -159.26947021484375, - "objective/kl": 28.415475845336914, - "objective/non_score_reward": -1.4207737445831299, - "objective/rlhf_reward": -5.683095276355743, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.907594680786133, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6328125, - "step": 953, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997883677482605 - }, - { - "episode": 15280, - "epoch": 0.09155073036871937, - "loss/policy_avg": 0.3368009328842163, - "lr": 9.390337423312884e-06, - "objective/entropy": -173.85415649414062, - "objective/kl": 35.513309478759766, - "objective/non_score_reward": -1.775665521621704, - "objective/rlhf_reward": -5.49854234224947, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 6.337751388549805, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.541015625, - "step": 954, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9994667768478394 - }, - { - "episode": 15296, - "epoch": 0.09164659500784891, - "loss/policy_avg": 0.0456845797598362, - "lr": 9.389698364008181e-06, - "objective/entropy": 16.725250244140625, - "objective/kl": 36.44686508178711, - "objective/non_score_reward": -1.822343349456787, - "objective/rlhf_reward": -5.865541179378596, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 21.832763671875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.548828125, - "step": 955, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999651551246643 - }, - { - "episode": 15312, - "epoch": 0.09174245964697847, - "loss/policy_avg": 0.0268879272043705, - "lr": 9.389059304703478e-06, - "objective/entropy": -219.0832977294922, - "objective/kl": 25.021286010742188, - "objective/non_score_reward": -1.2510643005371094, - "objective/rlhf_reward": -3.1794285133209934, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 13.525361061096191, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.673828125, - "step": 956, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0001580715179443 - }, - { - "episode": 15328, - "epoch": 0.09183832428610801, - "loss/policy_avg": 0.25198429822921753, - "lr": 9.388420245398773e-06, - "objective/entropy": -216.4515838623047, - "objective/kl": 29.98337173461914, - "objective/non_score_reward": -1.4991683959960938, - "objective/rlhf_reward": -3.0729548081171245, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 8.199630737304688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.634765625, - "step": 957, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9965362548828125 - }, - { - "episode": 15344, - "epoch": 0.09193418892523757, - "loss/policy_avg": 0.035516731441020966, - "lr": 9.38778118609407e-06, - "objective/entropy": -250.8704833984375, - "objective/kl": 30.556961059570312, - "objective/non_score_reward": -1.5278480052947998, - "objective/rlhf_reward": -4.73278991231094, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 3.100607395172119, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.607421875, - "step": 958, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0005221366882324 - }, - { - "episode": 15360, - "epoch": 0.09203005356436711, - "loss/policy_avg": 0.6594608426094055, - "lr": 9.387142126789367e-06, - "objective/entropy": -190.2021942138672, - "objective/kl": 29.693756103515625, - "objective/non_score_reward": -1.4846878051757812, - "objective/rlhf_reward": -4.38249173661764, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 11.999906539916992, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.55859375, - "step": 959, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9964005947113037 - }, - { - "episode": 15376, - "epoch": 0.09212591820349667, - "loss/policy_avg": 0.16847842931747437, - "lr": 9.386503067484664e-06, - "objective/entropy": -220.72311401367188, - "objective/kl": 22.618806838989258, - "objective/non_score_reward": -1.1309404373168945, - "objective/rlhf_reward": -3.0731633110955805, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 1.5775080919265747, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.63671875, - "step": 960, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0047149658203125 - }, - { - "episode": 15392, - "epoch": 0.09222178284262621, - "loss/policy_avg": 0.37361010909080505, - "lr": 9.38586400817996e-06, - "objective/entropy": -219.60760498046875, - "objective/kl": 31.668062210083008, - "objective/non_score_reward": -1.58340322971344, - "objective/rlhf_reward": -4.852660181935191, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 6.965027809143066, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.615234375, - "step": 961, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9987406730651855 - }, - { - "episode": 15408, - "epoch": 0.09231764748175576, - "loss/policy_avg": 0.3272181749343872, - "lr": 9.385224948875256e-06, - "objective/entropy": -200.26370239257812, - "objective/kl": 38.33747100830078, - "objective/non_score_reward": -1.916873574256897, - "objective/rlhf_reward": -5.720082710461552, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.9499969482421875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.583984375, - "step": 962, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9983041286468506 - }, - { - "episode": 15424, - "epoch": 0.0924135121208853, - "loss/policy_avg": 0.02453005313873291, - "lr": 9.384585889570553e-06, - "objective/entropy": -259.0159606933594, - "objective/kl": 32.376686096191406, - "objective/non_score_reward": -1.6188342571258545, - "objective/rlhf_reward": -5.051504810054866, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 11.491250038146973, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.634765625, - "step": 963, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0016989707946777 - }, - { - "episode": 15440, - "epoch": 0.09250937676001486, - "loss/policy_avg": -0.1082817018032074, - "lr": 9.38394683026585e-06, - "objective/entropy": -136.52200317382812, - "objective/kl": 34.37030792236328, - "objective/non_score_reward": -1.718515396118164, - "objective/rlhf_reward": -5.212202077329742, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 5.610563278198242, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.65234375, - "step": 964, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993445873260498 - }, - { - "episode": 15456, - "epoch": 0.0926052413991444, - "loss/policy_avg": 0.3635658025741577, - "lr": 9.383307770961147e-06, - "objective/entropy": -242.04705810546875, - "objective/kl": 26.167871475219727, - "objective/non_score_reward": -1.3083934783935547, - "objective/rlhf_reward": -3.8335740923881527, - "objective/scores": 0.35, - "policy/approxkl_avg": 10.497917175292969, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.505859375, - "step": 965, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998986840248108 - }, - { - "episode": 15472, - "epoch": 0.09270110603827396, - "loss/policy_avg": 0.4805383086204529, - "lr": 9.382668711656443e-06, - "objective/entropy": -130.80931091308594, - "objective/kl": 43.840057373046875, - "objective/non_score_reward": -2.192002773284912, - "objective/rlhf_reward": -6.368011450767517, - "objective/scores": 0.6, - "policy/approxkl_avg": 1.2675271034240723, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619140625, - "step": 966, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001443862915039 - }, - { - "episode": 15488, - "epoch": 0.0927969706774035, - "loss/policy_avg": 0.9434456825256348, - "lr": 9.382029652351739e-06, - "objective/entropy": -116.85310363769531, - "objective/kl": 55.79869842529297, - "objective/non_score_reward": -2.7899351119995117, - "objective/rlhf_reward": -9.426406518618265, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 2.6991868019104004, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.791015625, - "step": 967, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00309157371521 - }, - { - "episode": 15504, - "epoch": 0.09289283531653306, - "loss/policy_avg": 0.2830507755279541, - "lr": 9.381390593047035e-06, - "objective/entropy": -260.5260925292969, - "objective/kl": 34.16276550292969, - "objective/non_score_reward": -1.7081382274627686, - "objective/rlhf_reward": -5.381954531283721, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 4.792706489562988, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.615234375, - "step": 968, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9985530376434326 - }, - { - "episode": 15520, - "epoch": 0.0929886999556626, - "loss/policy_avg": 0.19756931066513062, - "lr": 9.380751533742332e-06, - "objective/entropy": -234.741455078125, - "objective/kl": 25.891204833984375, - "objective/non_score_reward": -1.2945603132247925, - "objective/rlhf_reward": -3.055534782187019, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 2.262695789337158, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.640625, - "step": 969, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0007271766662598 - }, - { - "episode": 15536, - "epoch": 0.09308456459479215, - "loss/policy_avg": 0.0513734444975853, - "lr": 9.380112474437628e-06, - "objective/entropy": -195.60171508789062, - "objective/kl": 35.50217819213867, - "objective/non_score_reward": -1.775109052658081, - "objective/rlhf_reward": -5.741186105941219, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.6989755630493164, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.607421875, - "step": 970, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 2.0012567043304443 - }, - { - "episode": 15552, - "epoch": 0.0931804292339217, - "loss/policy_avg": 0.1513216644525528, - "lr": 9.379473415132924e-06, - "objective/entropy": -245.57977294921875, - "objective/kl": 23.89773941040039, - "objective/non_score_reward": -1.1948869228363037, - "objective/rlhf_reward": -4.7795480489730835, - "objective/scores": 0.0, - "policy/approxkl_avg": 6.129580020904541, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.69921875, - "step": 971, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000563621520996 - }, - { - "episode": 15568, - "epoch": 0.09327629387305125, - "loss/policy_avg": 0.041885554790496826, - "lr": 9.378834355828221e-06, - "objective/entropy": -261.82769775390625, - "objective/kl": 24.18181037902832, - "objective/non_score_reward": -1.2090904712677002, - "objective/rlhf_reward": -3.457759955016476, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 9.62070369720459, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.533203125, - "step": 972, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9969148635864258 - }, - { - "episode": 15584, - "epoch": 0.0933721585121808, - "loss/policy_avg": 0.012015002779662609, - "lr": 9.378195296523518e-06, - "objective/entropy": -251.767333984375, - "objective/kl": 27.563173294067383, - "objective/non_score_reward": -1.378158688545227, - "objective/rlhf_reward": -3.908514711920338, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 1.0967427492141724, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.634765625, - "step": 973, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0006260871887207 - }, - { - "episode": 15600, - "epoch": 0.09346802315131035, - "loss/policy_avg": -0.31819072365760803, - "lr": 9.377556237218815e-06, - "objective/entropy": -175.70556640625, - "objective/kl": 28.285152435302734, - "objective/non_score_reward": -1.4142576456069946, - "objective/rlhf_reward": -4.052910540167408, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.37001371383667, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.505859375, - "step": 974, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9995312690734863 - }, - { - "episode": 15616, - "epoch": 0.09356388779043989, - "loss/policy_avg": 0.6060304641723633, - "lr": 9.37691717791411e-06, - "objective/entropy": -34.974281311035156, - "objective/kl": 35.56610107421875, - "objective/non_score_reward": -1.7783050537109375, - "objective/rlhf_reward": -5.59744867065781, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 6.845120906829834, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.853515625, - "step": 975, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995267391204834 - }, - { - "episode": 15632, - "epoch": 0.09365975242956945, - "loss/policy_avg": 0.1691616326570511, - "lr": 9.376278118609407e-06, - "objective/entropy": -173.51535034179688, - "objective/kl": 40.181976318359375, - "objective/non_score_reward": -2.009099006652832, - "objective/rlhf_reward": -6.657793619719845, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 0.46673262119293213, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71875, - "step": 976, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0020508766174316 - }, - { - "episode": 15648, - "epoch": 0.09375561706869899, - "loss/policy_avg": 0.12263473123311996, - "lr": 9.375639059304704e-06, - "objective/entropy": -244.26974487304688, - "objective/kl": 29.573442459106445, - "objective/non_score_reward": -1.4786722660064697, - "objective/rlhf_reward": -4.358429758754328, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 3.748386859893799, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.693359375, - "step": 977, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9990627765655518 - }, - { - "episode": 15664, - "epoch": 0.09385148170782855, - "loss/policy_avg": 1.4557695388793945, - "lr": 9.375000000000001e-06, - "objective/entropy": -133.55853271484375, - "objective/kl": 45.2318229675293, - "objective/non_score_reward": -2.2615909576416016, - "objective/rlhf_reward": -7.530592167171177, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 4.7986626625061035, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.654296875, - "step": 978, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9999971389770508 - }, - { - "episode": 15680, - "epoch": 0.09394734634695809, - "loss/policy_avg": 0.04724450409412384, - "lr": 9.374360940695298e-06, - "objective/entropy": -291.25103759765625, - "objective/kl": 28.29153823852539, - "objective/non_score_reward": -1.4145770072937012, - "objective/rlhf_reward": -3.710896800236638, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.313387393951416, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 979, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9993375539779663 - }, - { - "episode": 15696, - "epoch": 0.09404321098608764, - "loss/policy_avg": 0.2293320745229721, - "lr": 9.373721881390595e-06, - "objective/entropy": -136.44857788085938, - "objective/kl": 38.36898422241211, - "objective/non_score_reward": -1.9184492826461792, - "objective/rlhf_reward": -5.551090779081855, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 2.303453207015991, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.61328125, - "step": 980, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999606966972351 - }, - { - "episode": 15712, - "epoch": 0.0941390756252172, - "loss/policy_avg": 0.16989938914775848, - "lr": 9.37308282208589e-06, - "objective/entropy": -171.79864501953125, - "objective/kl": 32.806495666503906, - "objective/non_score_reward": -1.640324592590332, - "objective/rlhf_reward": -4.613887022213872, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 8.31067180633545, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.603515625, - "step": 981, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9984290599822998 - }, - { - "episode": 15728, - "epoch": 0.09423494026434674, - "loss/policy_avg": 0.7234645485877991, - "lr": 9.372443762781187e-06, - "objective/entropy": -219.93374633789062, - "objective/kl": 26.91738510131836, - "objective/non_score_reward": -1.3458693027496338, - "objective/rlhf_reward": -0.9834773302078244, - "objective/scores": 1.1, - "policy/approxkl_avg": 1.4521507024765015, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.646484375, - "step": 982, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003150463104248 - }, - { - "episode": 15744, - "epoch": 0.0943308049034763, - "loss/policy_avg": 0.48133015632629395, - "lr": 9.371804703476484e-06, - "objective/entropy": -282.47552490234375, - "objective/kl": 39.29179763793945, - "objective/non_score_reward": -1.9645898342132568, - "objective/rlhf_reward": -6.125026241938272, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 6.169063568115234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.720703125, - "step": 983, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997374415397644 - }, - { - "episode": 15760, - "epoch": 0.09442666954260584, - "loss/policy_avg": 0.1187177523970604, - "lr": 9.37116564417178e-06, - "objective/entropy": -158.33642578125, - "objective/kl": 40.20547103881836, - "objective/non_score_reward": -2.0102736949920654, - "objective/rlhf_reward": -6.69945864966455, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 3.5165886878967285, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62109375, - "step": 984, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9981753826141357 - }, - { - "episode": 15776, - "epoch": 0.0945225341817354, - "loss/policy_avg": 0.16677279770374298, - "lr": 9.370526584867077e-06, - "objective/entropy": -162.21728515625, - "objective/kl": 33.61964797973633, - "objective/non_score_reward": -1.6809823513031006, - "objective/rlhf_reward": -5.323929286003112, - "objective/scores": 0.35, - "policy/approxkl_avg": 5.913999557495117, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.734375, - "step": 985, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9968831539154053 - }, - { - "episode": 15792, - "epoch": 0.09461839882086494, - "loss/policy_avg": 0.22338780760765076, - "lr": 9.369887525562373e-06, - "objective/entropy": -191.39588928222656, - "objective/kl": 50.39151382446289, - "objective/non_score_reward": -2.519575595855713, - "objective/rlhf_reward": -8.416443472326385, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 45.444732666015625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.755859375, - "step": 986, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998542308807373 - }, - { - "episode": 15808, - "epoch": 0.0947142634599945, - "loss/policy_avg": 0.37791919708251953, - "lr": 9.36924846625767e-06, - "objective/entropy": -270.806396484375, - "objective/kl": 29.205078125, - "objective/non_score_reward": -1.4602539539337158, - "objective/rlhf_reward": -5.841015696525574, - "objective/scores": 0.0, - "policy/approxkl_avg": 8.895004272460938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6875, - "step": 987, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979805946350098 - }, - { - "episode": 15824, - "epoch": 0.09481012809912404, - "loss/policy_avg": 0.7314577102661133, - "lr": 9.368609406952966e-06, - "objective/entropy": -174.33633422851562, - "objective/kl": 41.00555419921875, - "objective/non_score_reward": -2.0502774715423584, - "objective/rlhf_reward": -6.77727790613946, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 4.151052474975586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59375, - "step": 988, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998262882232666 - }, - { - "episode": 15840, - "epoch": 0.09490599273825359, - "loss/policy_avg": 0.1200692355632782, - "lr": 9.367970347648263e-06, - "objective/entropy": -259.9232177734375, - "objective/kl": 32.56160354614258, - "objective/non_score_reward": -1.628080129623413, - "objective/rlhf_reward": -5.112320518493652, - "objective/scores": 0.35, - "policy/approxkl_avg": 3.3896703720092773, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.615234375, - "step": 989, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0001492500305176 - }, - { - "episode": 15856, - "epoch": 0.09500185737738313, - "loss/policy_avg": 0.7871278524398804, - "lr": 9.367331288343558e-06, - "objective/entropy": -162.90664672851562, - "objective/kl": 37.55353927612305, - "objective/non_score_reward": -1.8776767253875732, - "objective/rlhf_reward": -6.086875279148188, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 24.93891716003418, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7890625, - "step": 990, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9959328174591064 - }, - { - "episode": 15872, - "epoch": 0.09509772201651269, - "loss/policy_avg": -0.12516134977340698, - "lr": 9.366692229038855e-06, - "objective/entropy": -238.83116149902344, - "objective/kl": 37.03616714477539, - "objective/non_score_reward": -1.8518084287643433, - "objective/rlhf_reward": -6.047983967994137, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 15.576482772827148, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.681640625, - "step": 991, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985809326171875 - }, - { - "episode": 15888, - "epoch": 0.09519358665564223, - "loss/policy_avg": -0.04968651384115219, - "lr": 9.366053169734152e-06, - "objective/entropy": -183.43231201171875, - "objective/kl": 35.40851593017578, - "objective/non_score_reward": -1.77042555809021, - "objective/rlhf_reward": -5.756189737349672, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 0.5774535536766052, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.583984375, - "step": 992, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.002680778503418 - }, - { - "episode": 15904, - "epoch": 0.09528945129477179, - "loss/policy_avg": 0.009859908372163773, - "lr": 9.365414110429449e-06, - "objective/entropy": -14.670166015625, - "objective/kl": 53.70581817626953, - "objective/non_score_reward": -2.685290813446045, - "objective/rlhf_reward": -8.3411630153656, - "objective/scores": 0.6, - "policy/approxkl_avg": 1.3184102773666382, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 993, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0005533695220947 - }, - { - "episode": 15920, - "epoch": 0.09538531593390133, - "loss/policy_avg": 0.3695295453071594, - "lr": 9.364775051124744e-06, - "objective/entropy": -288.468505859375, - "objective/kl": 32.96984100341797, - "objective/non_score_reward": -1.6484923362731934, - "objective/rlhf_reward": -5.0781975624882545, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 3.1653892993927, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.642578125, - "step": 994, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999734878540039 - }, - { - "episode": 15936, - "epoch": 0.09548118057303089, - "loss/policy_avg": 0.3992432951927185, - "lr": 9.364135991820041e-06, - "objective/entropy": -231.646728515625, - "objective/kl": 34.67195510864258, - "objective/non_score_reward": -1.733597755432129, - "objective/rlhf_reward": -5.510559280117121, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 19.767539978027344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.849609375, - "step": 995, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9987037181854248 - }, - { - "episode": 15952, - "epoch": 0.09557704521216043, - "loss/policy_avg": 0.03356311097741127, - "lr": 9.363496932515338e-06, - "objective/entropy": -210.72410583496094, - "objective/kl": 27.1010799407959, - "objective/non_score_reward": -1.3550540208816528, - "objective/rlhf_reward": -3.595387215885233, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 1.0958271026611328, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7265625, - "step": 996, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9994826316833496 - }, - { - "episode": 15968, - "epoch": 0.09567290985128998, - "loss/policy_avg": 1.1218140125274658, - "lr": 9.362857873210635e-06, - "objective/entropy": -71.63316345214844, - "objective/kl": 40.19666290283203, - "objective/non_score_reward": -2.009833335876465, - "objective/rlhf_reward": -8.03933310508728, - "objective/scores": 0.0, - "policy/approxkl_avg": 3.4838500022888184, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.66015625, - "step": 997, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991953372955322 - }, - { - "episode": 15984, - "epoch": 0.09576877449041953, - "loss/policy_avg": 0.23440885543823242, - "lr": 9.362218813905932e-06, - "objective/entropy": -217.69229125976562, - "objective/kl": 26.445728302001953, - "objective/non_score_reward": -1.3222863674163818, - "objective/rlhf_reward": -3.773374044688877, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 11.445338249206543, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.623046875, - "step": 998, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9970027208328247 - }, - { - "episode": 16000, - "epoch": 0.09586463912954908, - "loss/policy_avg": -0.3169388175010681, - "lr": 9.361579754601227e-06, - "objective/entropy": -116.28077697753906, - "objective/kl": 44.722564697265625, - "objective/non_score_reward": -2.236128091812134, - "objective/rlhf_reward": -6.997101019101079, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.412589073181152, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.734375, - "step": 999, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000357151031494 - }, - { - "episode": 16016, - "epoch": 0.09596050376867862, - "loss/policy_avg": 0.49583154916763306, - "lr": 9.360940695296524e-06, - "objective/entropy": -255.0631561279297, - "objective/kl": 37.207157135009766, - "objective/non_score_reward": -1.8603577613830566, - "objective/rlhf_reward": -4.517711792827818, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 4.7410383224487305, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 1000, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9983508586883545 - }, - { - "episode": 16032, - "epoch": 0.09605636840780818, - "loss/policy_avg": 0.2908029556274414, - "lr": 9.36030163599182e-06, - "objective/entropy": -158.05224609375, - "objective/kl": 43.559486389160156, - "objective/non_score_reward": -2.177974224090576, - "objective/rlhf_reward": -7.386384520560426, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.997418403625488, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.49609375, - "step": 1001, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000509738922119 - }, - { - "episode": 16048, - "epoch": 0.09615223304693772, - "loss/policy_avg": 0.0880887508392334, - "lr": 9.359662576687117e-06, - "objective/entropy": -159.17636108398438, - "objective/kl": 32.491432189941406, - "objective/non_score_reward": -1.6245718002319336, - "objective/rlhf_reward": -5.1196852708734095, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 23.146318435668945, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.615234375, - "step": 1002, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992117881774902 - }, - { - "episode": 16064, - "epoch": 0.09624809768606728, - "loss/policy_avg": -0.1608562171459198, - "lr": 9.359023517382414e-06, - "objective/entropy": 31.09607696533203, - "objective/kl": 48.06477355957031, - "objective/non_score_reward": -2.4032387733459473, - "objective/rlhf_reward": -7.78812610653312, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 4.198085784912109, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.71484375, - "step": 1003, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001403331756592 - }, - { - "episode": 16080, - "epoch": 0.09634396232519682, - "loss/policy_avg": -0.09791003167629242, - "lr": 9.358384458077711e-06, - "objective/entropy": -204.42648315429688, - "objective/kl": 32.63614273071289, - "objective/non_score_reward": -1.6318070888519287, - "objective/rlhf_reward": -5.127228474617004, - "objective/scores": 0.35, - "policy/approxkl_avg": 3.644939422607422, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.623046875, - "step": 1004, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0014867782592773 - }, - { - "episode": 16096, - "epoch": 0.09643982696432638, - "loss/policy_avg": 0.3904947340488434, - "lr": 9.357745398773006e-06, - "objective/entropy": -230.99227905273438, - "objective/kl": 26.775943756103516, - "objective/non_score_reward": -1.3387972116470337, - "objective/rlhf_reward": -3.955188965797424, - "objective/scores": 0.35, - "policy/approxkl_avg": 6.282003402709961, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.859375, - "step": 1005, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 12, - "val/ratio": 1.999232292175293 - }, - { - "episode": 16112, - "epoch": 0.09653569160345592, - "loss/policy_avg": 0.7725321054458618, - "lr": 9.357106339468303e-06, - "objective/entropy": -164.7260284423828, - "objective/kl": 36.20423889160156, - "objective/non_score_reward": -1.8102120161056519, - "objective/rlhf_reward": -5.416019315990518, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 2.2319459915161133, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69921875, - "step": 1006, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000358819961548 - }, - { - "episode": 16128, - "epoch": 0.09663155624258547, - "loss/policy_avg": 0.4622969627380371, - "lr": 9.3564672801636e-06, - "objective/entropy": -133.11448669433594, - "objective/kl": 46.60032272338867, - "objective/non_score_reward": -2.3300158977508545, - "objective/rlhf_reward": -7.920063829421997, - "objective/scores": 0.35, - "policy/approxkl_avg": 4.947162628173828, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.63671875, - "step": 1007, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9968760013580322 - }, - { - "episode": 16144, - "epoch": 0.09672742088171501, - "loss/policy_avg": 0.28032606840133667, - "lr": 9.355828220858897e-06, - "objective/entropy": -185.09371948242188, - "objective/kl": 38.272674560546875, - "objective/non_score_reward": -1.9136335849761963, - "objective/rlhf_reward": -6.275932290641171, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 5.263652801513672, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.650390625, - "step": 1008, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0010600090026855 - }, - { - "episode": 16160, - "epoch": 0.09682328552084457, - "loss/policy_avg": 0.18294349312782288, - "lr": 9.355189161554194e-06, - "objective/entropy": -147.19964599609375, - "objective/kl": 32.98589324951172, - "objective/non_score_reward": -1.6492946147918701, - "objective/rlhf_reward": -4.935319071233856, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 5.73829460144043, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.67578125, - "step": 1009, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979850053787231 - }, - { - "episode": 16176, - "epoch": 0.09691915015997411, - "loss/policy_avg": -0.004333788529038429, - "lr": 9.35455010224949e-06, - "objective/entropy": -197.96774291992188, - "objective/kl": 37.333194732666016, - "objective/non_score_reward": -1.8666596412658691, - "objective/rlhf_reward": -4.5429196699869365, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 1.3020401000976562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8203125, - "step": 1010, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9998223781585693 - }, - { - "episode": 16192, - "epoch": 0.09701501479910367, - "loss/policy_avg": -0.052329957485198975, - "lr": 9.353911042944786e-06, - "objective/entropy": -197.37957763671875, - "objective/kl": 30.12477684020996, - "objective/non_score_reward": -1.5062386989593506, - "objective/rlhf_reward": -4.077543805317815, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 1.2824825048446655, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.619140625, - "step": 1011, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000406265258789 - }, - { - "episode": 16208, - "epoch": 0.09711087943823321, - "loss/policy_avg": -0.058374106884002686, - "lr": 9.353271983640083e-06, - "objective/entropy": -196.46224975585938, - "objective/kl": 28.03622817993164, - "objective/non_score_reward": -1.4018113613128662, - "objective/rlhf_reward": -4.126293065960764, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.1209321022033691, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.57421875, - "step": 1012, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.001298427581787 - }, - { - "episode": 16224, - "epoch": 0.09720674407736277, - "loss/policy_avg": 0.7006990909576416, - "lr": 9.352632924335378e-06, - "objective/entropy": -285.3323974609375, - "objective/kl": 28.77189826965332, - "objective/non_score_reward": -1.4385948181152344, - "objective/rlhf_reward": -4.4288665390311905, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 2.1271591186523438, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.693359375, - "step": 1013, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.002638339996338 - }, - { - "episode": 16240, - "epoch": 0.09730260871649231, - "loss/policy_avg": 0.07051658630371094, - "lr": 9.351993865030675e-06, - "objective/entropy": -198.2432098388672, - "objective/kl": 24.557363510131836, - "objective/non_score_reward": -1.2278680801391602, - "objective/rlhf_reward": -3.5859598255454728, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 29.07752227783203, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6875, - "step": 1014, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990439414978027 - }, - { - "episode": 16256, - "epoch": 0.09739847335562186, - "loss/policy_avg": -0.5166081190109253, - "lr": 9.351354805725972e-06, - "objective/entropy": -63.29674530029297, - "objective/kl": 38.85722351074219, - "objective/non_score_reward": -1.9428613185882568, - "objective/rlhf_reward": -6.167325112883168, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 6.718572616577148, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.61328125, - "step": 1015, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0040817260742188 - }, - { - "episode": 16272, - "epoch": 0.0974943379947514, - "loss/policy_avg": 0.462972491979599, - "lr": 9.350715746421269e-06, - "objective/entropy": -214.515380859375, - "objective/kl": 33.796573638916016, - "objective/non_score_reward": -1.689828634262085, - "objective/rlhf_reward": -5.203055350986078, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 9.360330581665039, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.79296875, - "step": 1016, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.996612310409546 - }, - { - "episode": 16288, - "epoch": 0.09759020263388096, - "loss/policy_avg": -0.1453489363193512, - "lr": 9.350076687116566e-06, - "objective/entropy": -235.11651611328125, - "objective/kl": 33.26921081542969, - "objective/non_score_reward": -1.663460612297058, - "objective/rlhf_reward": -4.7064312202500656, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.160917282104492, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.703125, - "step": 1017, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.000718593597412 - }, - { - "episode": 16304, - "epoch": 0.0976860672730105, - "loss/policy_avg": 0.19937211275100708, - "lr": 9.34943762781186e-06, - "objective/entropy": -255.98963928222656, - "objective/kl": 37.99565887451172, - "objective/non_score_reward": -1.8997828960418701, - "objective/rlhf_reward": -6.257495990305571, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 18.184246063232422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 1018, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998490810394287 - }, - { - "episode": 16320, - "epoch": 0.09778193191214006, - "loss/policy_avg": -0.04537857323884964, - "lr": 9.348798568507158e-06, - "objective/entropy": -208.28750610351562, - "objective/kl": 29.751262664794922, - "objective/non_score_reward": -1.4875633716583252, - "objective/rlhf_reward": -4.002841900067265, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 35.739540100097656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.787109375, - "step": 1019, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9980144500732422 - }, - { - "episode": 16336, - "epoch": 0.0978777965512696, - "loss/policy_avg": 0.15292394161224365, - "lr": 9.348159509202455e-06, - "objective/entropy": -234.64700317382812, - "objective/kl": 29.85890769958496, - "objective/non_score_reward": -1.4929454326629639, - "objective/rlhf_reward": -4.3676616287866405, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.772150993347168, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.677734375, - "step": 1020, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000951290130615 - }, - { - "episode": 16352, - "epoch": 0.09797366119039916, - "loss/policy_avg": 0.3814322352409363, - "lr": 9.347520449897751e-06, - "objective/entropy": -124.42337799072266, - "objective/kl": 36.442901611328125, - "objective/non_score_reward": -1.8221449851989746, - "objective/rlhf_reward": -5.8885798215866085, - "objective/scores": 0.35, - "policy/approxkl_avg": 5.533565998077393, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.68359375, - "step": 1021, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9961100816726685 - }, - { - "episode": 16368, - "epoch": 0.0980695258295287, - "loss/policy_avg": 0.4999345541000366, - "lr": 9.346881390593048e-06, - "objective/entropy": -192.25704956054688, - "objective/kl": 24.090442657470703, - "objective/non_score_reward": -1.2045221328735352, - "objective/rlhf_reward": -3.3023168084942665, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 16.40319061279297, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.556640625, - "step": 1022, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997682809829712 - }, - { - "episode": 16384, - "epoch": 0.09816539046865826, - "loss/policy_avg": 0.22556136548519135, - "lr": 9.346242331288345e-06, - "objective/entropy": -280.6515197753906, - "objective/kl": 30.555099487304688, - "objective/non_score_reward": -1.5277550220489502, - "objective/rlhf_reward": -4.506900105539875, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 0.8321056365966797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.650390625, - "step": 1023, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9995068311691284 - }, - { - "episode": 16400, - "epoch": 0.0982612551077878, - "loss/policy_avg": 0.1927730292081833, - "lr": 9.34560327198364e-06, - "objective/entropy": -114.62777709960938, - "objective/kl": 41.009063720703125, - "objective/non_score_reward": -2.0504534244537354, - "objective/rlhf_reward": -6.5399538926488034, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 12.904714584350586, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.55859375, - "step": 1024, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9970085620880127 - }, - { - "episode": 16416, - "epoch": 0.09835711974691735, - "loss/policy_avg": 0.004962563514709473, - "lr": 9.344964212678937e-06, - "objective/entropy": -175.405029296875, - "objective/kl": 32.8451042175293, - "objective/non_score_reward": -1.6422550678253174, - "objective/rlhf_reward": -4.835687295595805, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 2.176795244216919, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.763671875, - "step": 1025, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9991182088851929 - }, - { - "episode": 16432, - "epoch": 0.0984529843860469, - "loss/policy_avg": 0.3356385827064514, - "lr": 9.344325153374234e-06, - "objective/entropy": -179.56375122070312, - "objective/kl": 44.559669494628906, - "objective/non_score_reward": -2.2279834747314453, - "objective/rlhf_reward": -7.1786006848017365, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.5793884992599487, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6875, - "step": 1026, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0000429153442383 - }, - { - "episode": 16448, - "epoch": 0.09854884902517645, - "loss/policy_avg": 0.062264252454042435, - "lr": 9.343686094069531e-06, - "objective/entropy": -124.67230224609375, - "objective/kl": 32.24571228027344, - "objective/non_score_reward": -1.6122857332229614, - "objective/rlhf_reward": -4.049142932891845, - "objective/scores": 0.6, - "policy/approxkl_avg": 4.209178924560547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.68359375, - "step": 1027, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998481273651123 - }, - { - "episode": 16464, - "epoch": 0.098644713664306, - "loss/policy_avg": 0.27750128507614136, - "lr": 9.343047034764828e-06, - "objective/entropy": -280.3656005859375, - "objective/kl": 36.0235710144043, - "objective/non_score_reward": -1.8011784553527832, - "objective/rlhf_reward": -5.863078525572448, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 9.040508270263672, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.705078125, - "step": 1028, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9958012104034424 - }, - { - "episode": 16480, - "epoch": 0.09874057830343555, - "loss/policy_avg": -0.08439403772354126, - "lr": 9.342407975460123e-06, - "objective/entropy": -159.83497619628906, - "objective/kl": 42.88642120361328, - "objective/non_score_reward": -2.1443209648132324, - "objective/rlhf_reward": -7.1266858383134455, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 6.443965911865234, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.701171875, - "step": 1029, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003908157348633 - }, - { - "episode": 16496, - "epoch": 0.09883644294256509, - "loss/policy_avg": 0.6222244501113892, - "lr": 9.34176891615542e-06, - "objective/entropy": -148.41481018066406, - "objective/kl": 38.87040710449219, - "objective/non_score_reward": -1.943520188331604, - "objective/rlhf_reward": -6.258308732303318, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 28.20026397705078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.87109375, - "step": 1030, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9953274726867676 - }, - { - "episode": 16512, - "epoch": 0.09893230758169465, - "loss/policy_avg": 0.04845335707068443, - "lr": 9.341129856850717e-06, - "objective/entropy": -236.35935974121094, - "objective/kl": 28.790306091308594, - "objective/non_score_reward": -1.4395153522491455, - "objective/rlhf_reward": -3.8106498820351913, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 6.143889427185059, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.642578125, - "step": 1031, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9981472492218018 - }, - { - "episode": 16528, - "epoch": 0.09902817222082419, - "loss/policy_avg": 0.1800106167793274, - "lr": 9.340490797546014e-06, - "objective/entropy": -234.52456665039062, - "objective/kl": 38.6103515625, - "objective/non_score_reward": -1.9305176734924316, - "objective/rlhf_reward": -7.722070813179016, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.025315761566162, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58984375, - "step": 1032, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001583099365234 - }, - { - "episode": 16544, - "epoch": 0.09912403685995375, - "loss/policy_avg": 0.1573864221572876, - "lr": 9.33985173824131e-06, - "objective/entropy": -206.30435180664062, - "objective/kl": 29.538883209228516, - "objective/non_score_reward": -1.4769442081451416, - "objective/rlhf_reward": -3.507777070999145, - "objective/scores": 0.6, - "policy/approxkl_avg": 3.956908702850342, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7734375, - "step": 1033, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9989354610443115 - }, - { - "episode": 16560, - "epoch": 0.09921990149908329, - "loss/policy_avg": 0.3316153883934021, - "lr": 9.339212678936606e-06, - "objective/entropy": -158.2957763671875, - "objective/kl": 27.869169235229492, - "objective/non_score_reward": -1.393458366394043, - "objective/rlhf_reward": -5.5738338232040405, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.423194169998169, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.60546875, - "step": 1034, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998276710510254 - }, - { - "episode": 16576, - "epoch": 0.09931576613821284, - "loss/policy_avg": 0.540399432182312, - "lr": 9.338573619631903e-06, - "objective/entropy": -278.6914367675781, - "objective/kl": 24.516807556152344, - "objective/non_score_reward": -1.2258403301239014, - "objective/rlhf_reward": -3.4795294596749224, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 3.2752022743225098, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 1035, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9993618726730347 - }, - { - "episode": 16592, - "epoch": 0.09941163077734239, - "loss/policy_avg": 0.17466121912002563, - "lr": 9.3379345603272e-06, - "objective/entropy": -273.7776794433594, - "objective/kl": 35.438560485839844, - "objective/non_score_reward": -1.7719281911849976, - "objective/rlhf_reward": -5.571941101344761, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 19.215896606445312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.650390625, - "step": 1036, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990078210830688 - }, - { - "episode": 16608, - "epoch": 0.09950749541647194, - "loss/policy_avg": 0.6281372308731079, - "lr": 9.337295501022495e-06, - "objective/entropy": -54.27313232421875, - "objective/kl": 45.946815490722656, - "objective/non_score_reward": -2.2973408699035645, - "objective/rlhf_reward": -7.6735920546376075, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 10.886024475097656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.931640625, - "step": 1037, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997375011444092 - }, - { - "episode": 16624, - "epoch": 0.0996033600556015, - "loss/policy_avg": 0.5044693350791931, - "lr": 9.336656441717792e-06, - "objective/entropy": -51.8316650390625, - "objective/kl": 34.80516815185547, - "objective/non_score_reward": -1.7402584552764893, - "objective/rlhf_reward": -5.5104356213525385, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 3.0943219661712646, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.837890625, - "step": 1038, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0017142295837402 - }, - { - "episode": 16640, - "epoch": 0.09969922469473104, - "loss/policy_avg": 0.050643354654312134, - "lr": 9.336017382413088e-06, - "objective/entropy": -289.61761474609375, - "objective/kl": 35.579490661621094, - "objective/non_score_reward": -1.7789745330810547, - "objective/rlhf_reward": -5.559638767448023, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 29.854312896728516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.71484375, - "step": 1039, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.99898362159729 - }, - { - "episode": 16656, - "epoch": 0.0997950893338606, - "loss/policy_avg": 0.66060471534729, - "lr": 9.335378323108385e-06, - "objective/entropy": -253.1927490234375, - "objective/kl": 31.551429748535156, - "objective/non_score_reward": -1.5775716304779053, - "objective/rlhf_reward": -4.9847734308540055, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 74.64668273925781, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.619140625, - "step": 1040, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998822569847107 - }, - { - "episode": 16672, - "epoch": 0.09989095397299014, - "loss/policy_avg": 0.9751706123352051, - "lr": 9.334739263803682e-06, - "objective/entropy": -148.04188537597656, - "objective/kl": 32.937591552734375, - "objective/non_score_reward": -1.6468796730041504, - "objective/rlhf_reward": -6.587518572807312, - "objective/scores": 0.0, - "policy/approxkl_avg": 6.001709461212158, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.728515625, - "step": 1041, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000730037689209 - }, - { - "episode": 16688, - "epoch": 0.0999868186121197, - "loss/policy_avg": 0.37717461585998535, - "lr": 9.334100204498977e-06, - "objective/entropy": -37.40810012817383, - "objective/kl": 31.557598114013672, - "objective/non_score_reward": -1.5778799057006836, - "objective/rlhf_reward": -4.364108632283147, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 75.23666381835938, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.60546875, - "step": 1042, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990663528442383 - }, - { - "episode": 16704, - "epoch": 0.10008268325124924, - "loss/policy_avg": 0.21707114577293396, - "lr": 9.333461145194274e-06, - "objective/entropy": -185.875732421875, - "objective/kl": 31.279882431030273, - "objective/non_score_reward": -1.563994288444519, - "objective/rlhf_reward": -4.133270683065925, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 13.107833862304688, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.703125, - "step": 1043, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998514175415039 - }, - { - "episode": 16720, - "epoch": 0.10017854789037879, - "loss/policy_avg": 0.19673524796962738, - "lr": 9.332822085889571e-06, - "objective/entropy": -271.62109375, - "objective/kl": 31.95672607421875, - "objective/non_score_reward": -1.5978362560272217, - "objective/rlhf_reward": -1.9913449048995968, - "objective/scores": 1.1, - "policy/approxkl_avg": 8.022303581237793, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.669921875, - "step": 1044, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9962241649627686 - }, - { - "episode": 16736, - "epoch": 0.10027441252950833, - "loss/policy_avg": 0.36011672019958496, - "lr": 9.332183026584868e-06, - "objective/entropy": -189.5650634765625, - "objective/kl": 27.331592559814453, - "objective/non_score_reward": -1.3665797710418701, - "objective/rlhf_reward": -3.34361249424604, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 12.111129760742188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7890625, - "step": 1045, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9945616722106934 - }, - { - "episode": 16752, - "epoch": 0.10037027716863789, - "loss/policy_avg": 0.24991941452026367, - "lr": 9.331543967280165e-06, - "objective/entropy": -269.1661682128906, - "objective/kl": 29.150144577026367, - "objective/non_score_reward": -1.4575071334838867, - "objective/rlhf_reward": -4.314256751331028, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 39.73731231689453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.591796875, - "step": 1046, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9968986511230469 - }, - { - "episode": 16768, - "epoch": 0.10046614180776743, - "loss/policy_avg": 0.018538065254688263, - "lr": 9.330904907975462e-06, - "objective/entropy": -128.5980224609375, - "objective/kl": 42.25013732910156, - "objective/non_score_reward": -2.112506866455078, - "objective/rlhf_reward": -6.050027823448181, - "objective/scores": 0.6, - "policy/approxkl_avg": 1.4199237823486328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6875, - "step": 1047, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000657320022583 - }, - { - "episode": 16784, - "epoch": 0.10056200644689699, - "loss/policy_avg": 0.35199424624443054, - "lr": 9.330265848670757e-06, - "objective/entropy": -282.9249572753906, - "objective/kl": 34.62944793701172, - "objective/non_score_reward": -1.7314722537994385, - "objective/rlhf_reward": -4.002170358539793, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 2.832670211791992, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.60546875, - "step": 1048, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9977006912231445 - }, - { - "episode": 16800, - "epoch": 0.10065787108602653, - "loss/policy_avg": -0.12381379306316376, - "lr": 9.329626789366054e-06, - "objective/entropy": -177.63133239746094, - "objective/kl": 29.458477020263672, - "objective/non_score_reward": -1.472923755645752, - "objective/rlhf_reward": -4.229835753858673, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.280195713043213, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.638671875, - "step": 1049, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.002951145172119 - }, - { - "episode": 16816, - "epoch": 0.10075373572515609, - "loss/policy_avg": 0.06033053621649742, - "lr": 9.32898773006135e-06, - "objective/entropy": -229.76272583007812, - "objective/kl": 25.89266586303711, - "objective/non_score_reward": -1.294633388519287, - "objective/rlhf_reward": -3.7547014548378863, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 3.814189910888672, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6640625, - "step": 1050, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9981303215026855 - }, - { - "episode": 16832, - "epoch": 0.10084960036428563, - "loss/policy_avg": -0.14406134188175201, - "lr": 9.328348670756648e-06, - "objective/entropy": -121.60057067871094, - "objective/kl": 34.72946548461914, - "objective/non_score_reward": -1.7364733219146729, - "objective/rlhf_reward": -5.522061069210139, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 4.643096446990967, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.609375, - "step": 1051, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0071685314178467 - }, - { - "episode": 16848, - "epoch": 0.10094546500341518, - "loss/policy_avg": 0.3516131639480591, - "lr": 9.327709611451944e-06, - "objective/entropy": -290.5709228515625, - "objective/kl": 32.417964935302734, - "objective/non_score_reward": -1.6208982467651367, - "objective/rlhf_reward": -2.0835929870605465, - "objective/scores": 1.1, - "policy/approxkl_avg": 106.68559265136719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.578125, - "step": 1052, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990932941436768 - }, - { - "episode": 16864, - "epoch": 0.10104132964254472, - "loss/policy_avg": -0.2397887408733368, - "lr": 9.32707055214724e-06, - "objective/entropy": -130.25076293945312, - "objective/kl": 37.00995635986328, - "objective/non_score_reward": -1.850497841835022, - "objective/rlhf_reward": -5.978159268100825, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 5.758305072784424, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6875, - "step": 1053, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0002975463867188 - }, - { - "episode": 16880, - "epoch": 0.10113719428167428, - "loss/policy_avg": 0.07710824906826019, - "lr": 9.326431492842537e-06, - "objective/entropy": -265.08575439453125, - "objective/kl": 30.579792022705078, - "objective/non_score_reward": -1.528989553451538, - "objective/rlhf_reward": -3.192239318729612, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 1.1249363422393799, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.720703125, - "step": 1054, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.99985933303833 - }, - { - "episode": 16896, - "epoch": 0.10123305892080382, - "loss/policy_avg": 0.5552304983139038, - "lr": 9.325792433537833e-06, - "objective/entropy": -214.11900329589844, - "objective/kl": 49.237579345703125, - "objective/non_score_reward": -2.461879014968872, - "objective/rlhf_reward": -8.447516059875488, - "objective/scores": 0.35, - "policy/approxkl_avg": 28.872817993164062, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.66796875, - "step": 1055, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9940861463546753 - }, - { - "episode": 16912, - "epoch": 0.10132892355993338, - "loss/policy_avg": 0.4369004964828491, - "lr": 9.325153374233129e-06, - "objective/entropy": -218.92349243164062, - "objective/kl": 31.91252899169922, - "objective/non_score_reward": -1.5956264734268188, - "objective/rlhf_reward": -4.435094545559819, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 20.476360321044922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 1056, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.99814772605896 - }, - { - "episode": 16928, - "epoch": 0.10142478819906292, - "loss/policy_avg": 0.11664807796478271, - "lr": 9.324514314928425e-06, - "objective/entropy": -241.1952667236328, - "objective/kl": 33.52198791503906, - "objective/non_score_reward": -1.6760993003845215, - "objective/rlhf_reward": -4.971064166227976, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 4.343099594116211, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.822265625, - "step": 1057, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9999535083770752 - }, - { - "episode": 16944, - "epoch": 0.10152065283819248, - "loss/policy_avg": -0.033681720495224, - "lr": 9.323875255623722e-06, - "objective/entropy": -244.3253173828125, - "objective/kl": 26.85427474975586, - "objective/non_score_reward": -1.3427138328552246, - "objective/rlhf_reward": -3.7667349911371044, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.019390106201172, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.677734375, - "step": 1058, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000871419906616 - }, - { - "episode": 16960, - "epoch": 0.10161651747732202, - "loss/policy_avg": -0.006691465154290199, - "lr": 9.32323619631902e-06, - "objective/entropy": -193.07406616210938, - "objective/kl": 22.30344009399414, - "objective/non_score_reward": -1.115172028541565, - "objective/rlhf_reward": -2.9044288088947083, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.373213768005371, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.734375, - "step": 1059, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9997581243515015 - }, - { - "episode": 16976, - "epoch": 0.10171238211645157, - "loss/policy_avg": 0.03293745219707489, - "lr": 9.322597137014316e-06, - "objective/entropy": -276.60870361328125, - "objective/kl": 35.162376403808594, - "objective/non_score_reward": -1.7581188678741455, - "objective/rlhf_reward": -5.632475113868713, - "objective/scores": 0.35, - "policy/approxkl_avg": 6.92661714553833, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 1060, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9994102716445923 - }, - { - "episode": 16992, - "epoch": 0.10180824675558112, - "loss/policy_avg": 0.009452302008867264, - "lr": 9.321958077709611e-06, - "objective/entropy": -167.18348693847656, - "objective/kl": 33.525054931640625, - "objective/non_score_reward": -1.676252841949463, - "objective/rlhf_reward": -5.043151860654937, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 0.7187179923057556, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.98828125, - "step": 1061, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002134323120117 - }, - { - "episode": 17008, - "epoch": 0.10190411139471067, - "loss/policy_avg": 0.2391328066587448, - "lr": 9.321319018404908e-06, - "objective/entropy": -251.56936645507812, - "objective/kl": 31.454349517822266, - "objective/non_score_reward": -1.5727174282073975, - "objective/rlhf_reward": -4.466040725978922, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 2.082510471343994, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.634765625, - "step": 1062, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0036981105804443 - }, - { - "episode": 17024, - "epoch": 0.10199997603384021, - "loss/policy_avg": 0.2995299696922302, - "lr": 9.320679959100205e-06, - "objective/entropy": -240.9496307373047, - "objective/kl": 36.60504913330078, - "objective/non_score_reward": -1.8302524089813232, - "objective/rlhf_reward": -5.716889891687947, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 1.9207489490509033, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.60546875, - "step": 1063, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9988347291946411 - }, - { - "episode": 17040, - "epoch": 0.10209584067296977, - "loss/policy_avg": 0.14015616476535797, - "lr": 9.320040899795502e-06, - "objective/entropy": -262.3077392578125, - "objective/kl": 22.77030372619629, - "objective/non_score_reward": -1.1385152339935303, - "objective/rlhf_reward": -3.2124252229029233, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 1.2123262882232666, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.734375, - "step": 1064, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9995969533920288 - }, - { - "episode": 17056, - "epoch": 0.10219170531209931, - "loss/policy_avg": 0.14029760658740997, - "lr": 9.319401840490799e-06, - "objective/entropy": -303.0190734863281, - "objective/kl": 25.82904815673828, - "objective/non_score_reward": -1.2914522886276245, - "objective/rlhf_reward": -3.609549908843592, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 2.374150276184082, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.634765625, - "step": 1065, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9978749752044678 - }, - { - "episode": 17072, - "epoch": 0.10228756995122887, - "loss/policy_avg": 0.3477242588996887, - "lr": 9.318762781186094e-06, - "objective/entropy": -70.10704040527344, - "objective/kl": 36.12684631347656, - "objective/non_score_reward": -1.806342363357544, - "objective/rlhf_reward": -5.883733919172911, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.796685695648193, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.560546875, - "step": 1066, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998852014541626 - }, - { - "episode": 17088, - "epoch": 0.10238343459035841, - "loss/policy_avg": 0.07034695893526077, - "lr": 9.318123721881391e-06, - "objective/entropy": -297.8764343261719, - "objective/kl": 27.875173568725586, - "objective/non_score_reward": -1.3937586545944214, - "objective/rlhf_reward": -5.575034737586975, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.2109901905059814, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 1067, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.002016305923462 - }, - { - "episode": 17104, - "epoch": 0.10247929922948797, - "loss/policy_avg": 1.4407649040222168, - "lr": 9.317484662576688e-06, - "objective/entropy": -241.74539184570312, - "objective/kl": 19.868005752563477, - "objective/non_score_reward": -0.9934003353118896, - "objective/rlhf_reward": -2.369481120173054, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 1.7092839479446411, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.673828125, - "step": 1068, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.004781723022461 - }, - { - "episode": 17120, - "epoch": 0.10257516386861751, - "loss/policy_avg": 0.2252398431301117, - "lr": 9.316845603271985e-06, - "objective/entropy": -238.30023193359375, - "objective/kl": 36.790252685546875, - "objective/non_score_reward": -1.839512586593628, - "objective/rlhf_reward": -5.410638998227055, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 1.8241536617279053, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.71875, - "step": 1069, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990484714508057 - }, - { - "episode": 17136, - "epoch": 0.10267102850774706, - "loss/policy_avg": 0.2009587585926056, - "lr": 9.316206543967282e-06, - "objective/entropy": -281.51422119140625, - "objective/kl": 31.799592971801758, - "objective/non_score_reward": -1.589979648590088, - "objective/rlhf_reward": -3.9599182963371273, - "objective/scores": 0.6, - "policy/approxkl_avg": 11.409127235412598, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73046875, - "step": 1070, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9992220401763916 - }, - { - "episode": 17152, - "epoch": 0.1027668931468766, - "loss/policy_avg": 0.07947662472724915, - "lr": 9.315567484662578e-06, - "objective/entropy": -224.4807891845703, - "objective/kl": 26.412246704101562, - "objective/non_score_reward": -1.3206123113632202, - "objective/rlhf_reward": -3.1597432515778876, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 0.5046712756156921, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.568359375, - "step": 1071, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.00076961517334 - }, - { - "episode": 17168, - "epoch": 0.10286275778600616, - "loss/policy_avg": 0.06411048024892807, - "lr": 9.314928425357874e-06, - "objective/entropy": -184.87181091308594, - "objective/kl": 18.737346649169922, - "objective/non_score_reward": -0.9368672370910645, - "objective/rlhf_reward": -1.6247628948846198, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.1642158031463623, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.650390625, - "step": 1072, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995265007019043 - }, - { - "episode": 17184, - "epoch": 0.1029586224251357, - "loss/policy_avg": 0.12491060793399811, - "lr": 9.31428936605317e-06, - "objective/entropy": -264.9185791015625, - "objective/kl": 33.87244415283203, - "objective/non_score_reward": -1.693622350692749, - "objective/rlhf_reward": -4.651782932058845, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.8209168910980225, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.638671875, - "step": 1073, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9998373985290527 - }, - { - "episode": 17200, - "epoch": 0.10305448706426526, - "loss/policy_avg": 0.18550439178943634, - "lr": 9.313650306748467e-06, - "objective/entropy": -263.8056335449219, - "objective/kl": 32.30176544189453, - "objective/non_score_reward": -1.6150879859924316, - "objective/rlhf_reward": -4.060352301597595, - "objective/scores": 0.6, - "policy/approxkl_avg": 9.517640113830566, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 1074, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998751163482666 - }, - { - "episode": 17216, - "epoch": 0.1031503517033948, - "loss/policy_avg": 0.03002159669995308, - "lr": 9.313011247443764e-06, - "objective/entropy": -127.8392562866211, - "objective/kl": 34.593231201171875, - "objective/non_score_reward": -1.729661464691162, - "objective/rlhf_reward": -4.518645679950714, - "objective/scores": 0.6, - "policy/approxkl_avg": 8.971546173095703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4599609375, - "step": 1075, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981253147125244 - }, - { - "episode": 17232, - "epoch": 0.10324621634252436, - "loss/policy_avg": 0.13241755962371826, - "lr": 9.312372188139061e-06, - "objective/entropy": -202.40301513671875, - "objective/kl": 18.52395248413086, - "objective/non_score_reward": -0.9261976480484009, - "objective/rlhf_reward": -2.2238379148796796, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 9.288294792175293, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8125, - "step": 1076, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001105785369873 - }, - { - "episode": 17248, - "epoch": 0.1033420809816539, - "loss/policy_avg": 1.6102979183197021, - "lr": 9.311733128834356e-06, - "objective/entropy": -234.32969665527344, - "objective/kl": 31.251758575439453, - "objective/non_score_reward": -1.5625879764556885, - "objective/rlhf_reward": -4.3029405576752975, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 20.491464614868164, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.724609375, - "step": 1077, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.996706485748291 - }, - { - "episode": 17264, - "epoch": 0.10343794562078346, - "loss/policy_avg": -0.1527136266231537, - "lr": 9.311094069529653e-06, - "objective/entropy": -268.0172119140625, - "objective/kl": 27.41750144958496, - "objective/non_score_reward": -1.3708750009536743, - "objective/rlhf_reward": -1.083500242233276, - "objective/scores": 1.1, - "policy/approxkl_avg": 2.5196101665496826, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.677734375, - "step": 1078, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0008769035339355 - }, - { - "episode": 17280, - "epoch": 0.103533810259913, - "loss/policy_avg": 0.062209486961364746, - "lr": 9.310455010224948e-06, - "objective/entropy": -160.53085327148438, - "objective/kl": 35.78590774536133, - "objective/non_score_reward": -1.7892953157424927, - "objective/rlhf_reward": -5.676228764469981, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 5.662154674530029, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.609375, - "step": 1079, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9969273805618286 - }, - { - "episode": 17296, - "epoch": 0.10362967489904255, - "loss/policy_avg": 0.8675416707992554, - "lr": 9.309815950920245e-06, - "objective/entropy": -288.6915283203125, - "objective/kl": 25.7120418548584, - "objective/non_score_reward": -1.28560209274292, - "objective/rlhf_reward": -3.1949969632195785, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 3.7964463233947754, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.748046875, - "step": 1080, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.999279260635376 - }, - { - "episode": 17312, - "epoch": 0.1037255395381721, - "loss/policy_avg": 0.022417806088924408, - "lr": 9.309176891615542e-06, - "objective/entropy": -234.59405517578125, - "objective/kl": 29.527116775512695, - "objective/non_score_reward": -1.476355791091919, - "objective/rlhf_reward": -4.2435640148526295, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.2056889533996582, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.623046875, - "step": 1081, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0002384185791016 - }, - { - "episode": 17328, - "epoch": 0.10382140417730165, - "loss/policy_avg": 1.0629796981811523, - "lr": 9.308537832310839e-06, - "objective/entropy": -235.58709716796875, - "objective/kl": 24.657703399658203, - "objective/non_score_reward": -1.2328851222991943, - "objective/rlhf_reward": -3.531540727615356, - "objective/scores": 0.35, - "policy/approxkl_avg": 5.628866195678711, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.591796875, - "step": 1082, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9997105598449707 - }, - { - "episode": 17344, - "epoch": 0.1039172688164312, - "loss/policy_avg": 0.43491989374160767, - "lr": 9.307898773006136e-06, - "objective/entropy": -116.438232421875, - "objective/kl": 31.854278564453125, - "objective/non_score_reward": -1.5927139520645142, - "objective/rlhf_reward": -4.970855867862701, - "objective/scores": 0.35, - "policy/approxkl_avg": 11.138096809387207, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.716796875, - "step": 1083, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0005722045898438 - }, - { - "episode": 17360, - "epoch": 0.10401313345556075, - "loss/policy_avg": 1.154296636581421, - "lr": 9.307259713701433e-06, - "objective/entropy": -104.04910278320312, - "objective/kl": 33.66610336303711, - "objective/non_score_reward": -1.6833051443099976, - "objective/rlhf_reward": -3.8095016225588054, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 24.187870025634766, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.619140625, - "step": 1084, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990277290344238 - }, - { - "episode": 17376, - "epoch": 0.10410899809469029, - "loss/policy_avg": 2.80964732170105, - "lr": 9.306620654396728e-06, - "objective/entropy": -223.38082885742188, - "objective/kl": 42.09947967529297, - "objective/non_score_reward": -2.1049740314483643, - "objective/rlhf_reward": -6.47248501606458, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.849597454071045, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.630859375, - "step": 1085, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001406192779541 - }, - { - "episode": 17392, - "epoch": 0.10420486273381985, - "loss/policy_avg": 0.4371190667152405, - "lr": 9.305981595092025e-06, - "objective/entropy": -209.35194396972656, - "objective/kl": 23.755962371826172, - "objective/non_score_reward": -1.187798023223877, - "objective/rlhf_reward": -3.1470724082628063, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 1.6081452369689941, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.626953125, - "step": 1086, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9983155727386475 - }, - { - "episode": 17408, - "epoch": 0.10430072737294939, - "loss/policy_avg": 0.27756333351135254, - "lr": 9.305342535787322e-06, - "objective/entropy": -262.8760986328125, - "objective/kl": 32.76499938964844, - "objective/non_score_reward": -1.6382498741149902, - "objective/rlhf_reward": -4.948879156176167, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 24.257652282714844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.61328125, - "step": 1087, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9991706609725952 - }, - { - "episode": 17424, - "epoch": 0.10439659201207895, - "loss/policy_avg": -0.05298028513789177, - "lr": 9.304703476482619e-06, - "objective/entropy": -69.1202163696289, - "objective/kl": 30.052305221557617, - "objective/non_score_reward": -1.5026153326034546, - "objective/rlhf_reward": -6.010461330413818, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.539027214050293, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4375, - "step": 1088, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001836061477661 - }, - { - "episode": 17440, - "epoch": 0.10449245665120849, - "loss/policy_avg": 0.7193084955215454, - "lr": 9.304064417177915e-06, - "objective/entropy": -143.99217224121094, - "objective/kl": 29.456846237182617, - "objective/non_score_reward": -1.4728422164916992, - "objective/rlhf_reward": -4.229509656847107, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 3.5728912353515625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6484375, - "step": 1089, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997127890586853 - }, - { - "episode": 17456, - "epoch": 0.10458832129033804, - "loss/policy_avg": 0.17522019147872925, - "lr": 9.30342535787321e-06, - "objective/entropy": -233.08404541015625, - "objective/kl": 32.47724914550781, - "objective/non_score_reward": -1.6238625049591064, - "objective/rlhf_reward": -5.116847612944943, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 33.11177444458008, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 1090, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9972889423370361 - }, - { - "episode": 17472, - "epoch": 0.10468418592946759, - "loss/policy_avg": 0.15333101153373718, - "lr": 9.302786298568508e-06, - "objective/entropy": -160.20663452148438, - "objective/kl": 36.02931594848633, - "objective/non_score_reward": -1.801465630531311, - "objective/rlhf_reward": -5.690090739520725, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 9.341711044311523, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.564453125, - "step": 1091, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000162124633789 - }, - { - "episode": 17488, - "epoch": 0.10478005056859714, - "loss/policy_avg": 0.13975301384925842, - "lr": 9.302147239263804e-06, - "objective/entropy": -148.38388061523438, - "objective/kl": 37.94308853149414, - "objective/non_score_reward": -1.8971545696258545, - "objective/rlhf_reward": -5.76378917244346, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 20.583585739135742, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6875, - "step": 1092, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984629154205322 - }, - { - "episode": 17504, - "epoch": 0.10487591520772668, - "loss/policy_avg": 0.06423387676477432, - "lr": 9.301508179959101e-06, - "objective/entropy": -251.20310974121094, - "objective/kl": 30.99344825744629, - "objective/non_score_reward": -1.5496724843978882, - "objective/rlhf_reward": -4.873177084952516, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 74.65060424804688, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6484375, - "step": 1093, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9979256391525269 - }, - { - "episode": 17520, - "epoch": 0.10497177984685624, - "loss/policy_avg": 0.045309893786907196, - "lr": 9.300869120654398e-06, - "objective/entropy": -231.59390258789062, - "objective/kl": 39.9537353515625, - "objective/non_score_reward": -1.9976863861083984, - "objective/rlhf_reward": -5.590746021270752, - "objective/scores": 0.6, - "policy/approxkl_avg": 1.6203057765960693, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.744140625, - "step": 1094, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9998950958251953 - }, - { - "episode": 17536, - "epoch": 0.1050676444859858, - "loss/policy_avg": 0.784805953502655, - "lr": 9.300230061349695e-06, - "objective/entropy": -211.55604553222656, - "objective/kl": 30.87300682067871, - "objective/non_score_reward": -1.5436503887176514, - "objective/rlhf_reward": -3.250882480980131, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 40.055843353271484, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.708984375, - "step": 1095, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0004684925079346 - }, - { - "episode": 17552, - "epoch": 0.10516350912511534, - "loss/policy_avg": -0.08781934529542923, - "lr": 9.29959100204499e-06, - "objective/entropy": -234.98513793945312, - "objective/kl": 32.781734466552734, - "objective/non_score_reward": -1.6390867233276367, - "objective/rlhf_reward": -5.156346833705902, - "objective/scores": 0.35, - "policy/approxkl_avg": 6.987787246704102, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6953125, - "step": 1096, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0007686614990234 - }, - { - "episode": 17568, - "epoch": 0.10525937376424489, - "loss/policy_avg": 0.01477903313934803, - "lr": 9.298951942740287e-06, - "objective/entropy": -247.9517822265625, - "objective/kl": 34.785831451416016, - "objective/non_score_reward": -1.7392916679382324, - "objective/rlhf_reward": -5.009755204396184, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 1.2163832187652588, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.609375, - "step": 1097, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9993152618408203 - }, - { - "episode": 17584, - "epoch": 0.10535523840337443, - "loss/policy_avg": 0.1219930574297905, - "lr": 9.298312883435584e-06, - "objective/entropy": -219.2138671875, - "objective/kl": 25.922840118408203, - "objective/non_score_reward": -1.2961418628692627, - "objective/rlhf_reward": -3.703615131790995, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 97.83702087402344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.689453125, - "step": 1098, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.998638391494751 - }, - { - "episode": 17600, - "epoch": 0.10545110304250399, - "loss/policy_avg": 0.9850329756736755, - "lr": 9.29767382413088e-06, - "objective/entropy": -280.9995422363281, - "objective/kl": 37.09015655517578, - "objective/non_score_reward": -1.8545079231262207, - "objective/rlhf_reward": -5.470620225148137, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 15.378658294677734, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.703125, - "step": 1099, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981629848480225 - }, - { - "episode": 17616, - "epoch": 0.10554696768163353, - "loss/policy_avg": 0.16606320440769196, - "lr": 9.297034764826178e-06, - "objective/entropy": -260.265625, - "objective/kl": 19.693069458007812, - "objective/non_score_reward": -0.9846534132957458, - "objective/rlhf_reward": -2.3344938195386704, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 10.92020320892334, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6640625, - "step": 1100, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996177315711975 - }, - { - "episode": 17632, - "epoch": 0.10564283232076309, - "loss/policy_avg": 0.01635119318962097, - "lr": 9.296395705521473e-06, - "objective/entropy": -205.85324096679688, - "objective/kl": 33.84467697143555, - "objective/non_score_reward": -1.6922338008880615, - "objective/rlhf_reward": -5.390333392707211, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 9.969751358032227, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.669921875, - "step": 1101, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0005996227264404 - }, - { - "episode": 17648, - "epoch": 0.10573869695989263, - "loss/policy_avg": -0.09314411878585815, - "lr": 9.29575664621677e-06, - "objective/entropy": -140.11074829101562, - "objective/kl": 30.367794036865234, - "objective/non_score_reward": -1.5183897018432617, - "objective/rlhf_reward": -4.592606308873057, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 20.063873291015625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.650390625, - "step": 1102, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999199628829956 - }, - { - "episode": 17664, - "epoch": 0.10583456159902219, - "loss/policy_avg": 0.6026681661605835, - "lr": 9.295117586912065e-06, - "objective/entropy": -229.55003356933594, - "objective/kl": 40.14759826660156, - "objective/non_score_reward": -2.0073800086975098, - "objective/rlhf_reward": -6.296186701456705, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 5.752803802490234, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.609375, - "step": 1103, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.99612295627594 - }, - { - "episode": 17680, - "epoch": 0.10593042623815173, - "loss/policy_avg": 0.4246598184108734, - "lr": 9.294478527607362e-06, - "objective/entropy": -282.4384460449219, - "objective/kl": 41.07707977294922, - "objective/non_score_reward": -2.053853988647461, - "objective/rlhf_reward": -6.856166326735897, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 23.673992156982422, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.650390625, - "step": 1104, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999070405960083 - }, - { - "episode": 17696, - "epoch": 0.10602629087728128, - "loss/policy_avg": 0.37388309836387634, - "lr": 9.293839468302659e-06, - "objective/entropy": 24.34271240234375, - "objective/kl": 43.73130798339844, - "objective/non_score_reward": -2.186565399169922, - "objective/rlhf_reward": -7.295662741275176, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 5.244170188903809, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.64453125, - "step": 1105, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000379800796509 - }, - { - "episode": 17712, - "epoch": 0.10612215551641083, - "loss/policy_avg": 0.48876816034317017, - "lr": 9.293200408997956e-06, - "objective/entropy": -201.01852416992188, - "objective/kl": 26.633869171142578, - "objective/non_score_reward": -1.3316935300827026, - "objective/rlhf_reward": -3.9481719518579066, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 2.9823970794677734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.552734375, - "step": 1106, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.996859073638916 - }, - { - "episode": 17728, - "epoch": 0.10621802015554038, - "loss/policy_avg": -0.03377959132194519, - "lr": 9.292561349693252e-06, - "objective/entropy": -243.04660034179688, - "objective/kl": 32.35979080200195, - "objective/non_score_reward": -1.6179895401000977, - "objective/rlhf_reward": -5.048125703533259, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 1.477148175239563, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.65625, - "step": 1107, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.001762628555298 - }, - { - "episode": 17744, - "epoch": 0.10631388479466992, - "loss/policy_avg": -0.23846808075904846, - "lr": 9.29192229038855e-06, - "objective/entropy": -251.7974395751953, - "objective/kl": 30.760231018066406, - "objective/non_score_reward": -1.5380115509033203, - "objective/rlhf_reward": -4.701447825045928, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 8.709911346435547, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.52734375, - "step": 1108, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0022435188293457 - }, - { - "episode": 17760, - "epoch": 0.10640974943379948, - "loss/policy_avg": 0.19507169723510742, - "lr": 9.291283231083845e-06, - "objective/entropy": -236.431396484375, - "objective/kl": 29.49862289428711, - "objective/non_score_reward": -1.474931240081787, - "objective/rlhf_reward": -4.237865214765654, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 18.90414047241211, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6953125, - "step": 1109, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998689889907837 - }, - { - "episode": 17776, - "epoch": 0.10650561407292902, - "loss/policy_avg": 0.08301146328449249, - "lr": 9.290644171779141e-06, - "objective/entropy": -275.0250244140625, - "objective/kl": 41.055580139160156, - "objective/non_score_reward": -2.052779197692871, - "objective/rlhf_reward": -6.477783219019571, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 14.971565246582031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 1110, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0006604194641113 - }, - { - "episode": 17792, - "epoch": 0.10660147871205858, - "loss/policy_avg": 1.2557047605514526, - "lr": 9.290005112474438e-06, - "objective/entropy": -183.14273071289062, - "objective/kl": 28.433589935302734, - "objective/non_score_reward": -1.4216796159744263, - "objective/rlhf_reward": -4.28671840429306, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.521367073059082, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.771484375, - "step": 1111, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.00211763381958 - }, - { - "episode": 17808, - "epoch": 0.10669734335118812, - "loss/policy_avg": -0.1782451868057251, - "lr": 9.289366053169735e-06, - "objective/entropy": -279.40826416015625, - "objective/kl": 18.467693328857422, - "objective/non_score_reward": -0.9233846068382263, - "objective/rlhf_reward": -2.334288516376896, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 4.5754899978637695, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.685546875, - "step": 1112, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0029964447021484 - }, - { - "episode": 17824, - "epoch": 0.10679320799031768, - "loss/policy_avg": 0.03669451177120209, - "lr": 9.288726993865032e-06, - "objective/entropy": -223.73326110839844, - "objective/kl": 29.530508041381836, - "objective/non_score_reward": -1.4765253067016602, - "objective/rlhf_reward": -3.783395232931648, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 2.630830764770508, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62109375, - "step": 1113, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9987865686416626 - }, - { - "episode": 17840, - "epoch": 0.10688907262944722, - "loss/policy_avg": 0.8654987215995789, - "lr": 9.288087934560327e-06, - "objective/entropy": -210.11935424804688, - "objective/kl": 29.22211456298828, - "objective/non_score_reward": -1.4611058235168457, - "objective/rlhf_reward": -4.288163571563318, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 0.9125807285308838, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.591796875, - "step": 1114, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.004485845565796 - }, - { - "episode": 17856, - "epoch": 0.10698493726857677, - "loss/policy_avg": -0.06222856044769287, - "lr": 9.287448875255624e-06, - "objective/entropy": -234.88995361328125, - "objective/kl": 29.992103576660156, - "objective/non_score_reward": -1.4996052980422974, - "objective/rlhf_reward": -3.074702118278715, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 3.34584903717041, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7265625, - "step": 1115, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9993617534637451 - }, - { - "episode": 17872, - "epoch": 0.10708080190770632, - "loss/policy_avg": 0.20112337172031403, - "lr": 9.286809815950921e-06, - "objective/entropy": -230.16200256347656, - "objective/kl": 30.825511932373047, - "objective/non_score_reward": -1.5412755012512207, - "objective/rlhf_reward": -4.714504103274688, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 12.865804672241211, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6953125, - "step": 1116, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9985527992248535 - }, - { - "episode": 17888, - "epoch": 0.10717666654683587, - "loss/policy_avg": 0.6556056141853333, - "lr": 9.286170756646218e-06, - "objective/entropy": -280.40069580078125, - "objective/kl": 28.695655822753906, - "objective/non_score_reward": -1.434782862663269, - "objective/rlhf_reward": -3.6164251587548595, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 2.831923007965088, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6796875, - "step": 1117, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9997923374176025 - }, - { - "episode": 17904, - "epoch": 0.10727253118596541, - "loss/policy_avg": 0.1591615378856659, - "lr": 9.285531697341515e-06, - "objective/entropy": -208.41720581054688, - "objective/kl": 32.10327911376953, - "objective/non_score_reward": -1.605163812637329, - "objective/rlhf_reward": -5.042053201285702, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 16.582778930664062, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5390625, - "step": 1118, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9988048076629639 - }, - { - "episode": 17920, - "epoch": 0.10736839582509497, - "loss/policy_avg": 0.6213997602462769, - "lr": 9.284892638036812e-06, - "objective/entropy": -174.9388427734375, - "objective/kl": 22.156795501708984, - "objective/non_score_reward": -1.107839584350586, - "objective/rlhf_reward": -2.698025361696879, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 2.6573128700256348, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4931640625, - "step": 1119, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001469612121582 - }, - { - "episode": 17936, - "epoch": 0.10746426046422451, - "loss/policy_avg": 0.15051786601543427, - "lr": 9.284253578732107e-06, - "objective/entropy": -67.49928283691406, - "objective/kl": 43.85652160644531, - "objective/non_score_reward": -2.19282603263855, - "objective/rlhf_reward": -7.3207059904054255, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 4.555420875549316, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6640625, - "step": 1120, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001720666885376 - }, - { - "episode": 17952, - "epoch": 0.10756012510335407, - "loss/policy_avg": -0.04347284138202667, - "lr": 9.283614519427404e-06, - "objective/entropy": -228.60853576660156, - "objective/kl": 27.952720642089844, - "objective/non_score_reward": -1.39763605594635, - "objective/rlhf_reward": -5.590544044971466, - "objective/scores": 0.0, - "policy/approxkl_avg": 23.599834442138672, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.76953125, - "step": 1121, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0020194053649902 - }, - { - "episode": 17968, - "epoch": 0.10765598974248361, - "loss/policy_avg": -0.053687386214733124, - "lr": 9.2829754601227e-06, - "objective/entropy": -207.92953491210938, - "objective/kl": 39.524742126464844, - "objective/non_score_reward": -1.9762370586395264, - "objective/rlhf_reward": -5.957537005619939, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 9.628499984741211, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6171875, - "step": 1122, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000182867050171 - }, - { - "episode": 17984, - "epoch": 0.10775185438161317, - "loss/policy_avg": -0.1910426765680313, - "lr": 9.282336400817996e-06, - "objective/entropy": -152.73464965820312, - "objective/kl": 33.28754425048828, - "objective/non_score_reward": -1.664376974105835, - "objective/rlhf_reward": -5.141736590655979, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 5.299195289611816, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.73828125, - "step": 1123, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9999160766601562 - }, - { - "episode": 18000, - "epoch": 0.10784771902074271, - "loss/policy_avg": 0.23040008544921875, - "lr": 9.281697341513293e-06, - "objective/entropy": -260.4175109863281, - "objective/kl": 27.83688735961914, - "objective/non_score_reward": -1.391844391822815, - "objective/rlhf_reward": -2.64365843379614, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 1.275976538658142, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.642578125, - "step": 1124, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999161720275879 - }, - { - "episode": 18016, - "epoch": 0.10794358365987226, - "loss/policy_avg": 0.38624101877212524, - "lr": 9.28105828220859e-06, - "objective/entropy": -278.8191833496094, - "objective/kl": 41.93511962890625, - "objective/non_score_reward": -2.0967559814453125, - "objective/rlhf_reward": -5.987024164199829, - "objective/scores": 0.6, - "policy/approxkl_avg": 5.510004043579102, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6875, - "step": 1125, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.000148057937622 - }, - { - "episode": 18032, - "epoch": 0.1080394482990018, - "loss/policy_avg": 0.07502768188714981, - "lr": 9.280419222903886e-06, - "objective/entropy": -261.2082824707031, - "objective/kl": 36.19464111328125, - "objective/non_score_reward": -1.80973219871521, - "objective/rlhf_reward": -5.291517208294804, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 31.399539947509766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.599609375, - "step": 1126, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9981523752212524 - }, - { - "episode": 18048, - "epoch": 0.10813531293813136, - "loss/policy_avg": 0.027504732832312584, - "lr": 9.279780163599183e-06, - "objective/entropy": -173.93919372558594, - "objective/kl": 38.43782424926758, - "objective/non_score_reward": -1.921891212463379, - "objective/rlhf_reward": -6.236966948123321, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 4.174002647399902, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.578125, - "step": 1127, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981164932250977 - }, - { - "episode": 18064, - "epoch": 0.1082311775772609, - "loss/policy_avg": -0.27174612879753113, - "lr": 9.279141104294478e-06, - "objective/entropy": -244.70285034179688, - "objective/kl": 29.41028594970703, - "objective/non_score_reward": -1.4705145359039307, - "objective/rlhf_reward": -5.8820579051971436, - "objective/scores": 0.0, - "policy/approxkl_avg": 13.355351448059082, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.76171875, - "step": 1128, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0021891593933105 - }, - { - "episode": 18080, - "epoch": 0.10832704221639046, - "loss/policy_avg": 0.1301630437374115, - "lr": 9.278502044989775e-06, - "objective/entropy": -217.2534942626953, - "objective/kl": 24.805774688720703, - "objective/non_score_reward": -1.2402887344360352, - "objective/rlhf_reward": -3.5105568572000116, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 179.41348266601562, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.681640625, - "step": 1129, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9998412132263184 - }, - { - "episode": 18096, - "epoch": 0.10842290685552, - "loss/policy_avg": 0.29972007870674133, - "lr": 9.277862985685072e-06, - "objective/entropy": -165.94686889648438, - "objective/kl": 33.62857437133789, - "objective/non_score_reward": -1.6814286708831787, - "objective/rlhf_reward": -5.063855295599089, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 9.789844512939453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.54296875, - "step": 1130, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996779441833496 - }, - { - "episode": 18112, - "epoch": 0.10851877149464956, - "loss/policy_avg": -0.1860085129737854, - "lr": 9.277223926380369e-06, - "objective/entropy": -216.37200927734375, - "objective/kl": 34.99008560180664, - "objective/non_score_reward": -1.7495043277740479, - "objective/rlhf_reward": -5.517064335759043, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.947920799255371, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.591796875, - "step": 1131, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001011371612549 - }, - { - "episode": 18128, - "epoch": 0.1086146361337791, - "loss/policy_avg": 1.0164711475372314, - "lr": 9.276584867075666e-06, - "objective/entropy": -198.08203125, - "objective/kl": 27.897228240966797, - "objective/non_score_reward": -1.3948614597320557, - "objective/rlhf_reward": -4.253932688265962, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 3.5322012901306152, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5859375, - "step": 1132, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0027313232421875 - }, - { - "episode": 18144, - "epoch": 0.10871050077290866, - "loss/policy_avg": -0.12127675116062164, - "lr": 9.275945807770961e-06, - "objective/entropy": -220.23248291015625, - "objective/kl": 32.97924041748047, - "objective/non_score_reward": -1.6489620208740234, - "objective/rlhf_reward": -4.648436854557927, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 18.976924896240234, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.69140625, - "step": 1133, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001431465148926 - }, - { - "episode": 18160, - "epoch": 0.1088063654120382, - "loss/policy_avg": 0.2887868881225586, - "lr": 9.275306748466258e-06, - "objective/entropy": -276.16912841796875, - "objective/kl": 37.935035705566406, - "objective/non_score_reward": -1.8967517614364624, - "objective/rlhf_reward": -6.10605442803657, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 18.96986961364746, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.625, - "step": 1134, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982385635375977 - }, - { - "episode": 18176, - "epoch": 0.10890223005116775, - "loss/policy_avg": 0.448369562625885, - "lr": 9.274667689161555e-06, - "objective/entropy": -169.45448303222656, - "objective/kl": 37.67509078979492, - "objective/non_score_reward": -1.8837544918060303, - "objective/rlhf_reward": -6.084420065493926, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 27.178815841674805, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.671875, - "step": 1135, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000478506088257 - }, - { - "episode": 18192, - "epoch": 0.1089980946902973, - "loss/policy_avg": 0.5679232478141785, - "lr": 9.274028629856852e-06, - "objective/entropy": -180.7431182861328, - "objective/kl": 39.18467330932617, - "objective/non_score_reward": -1.9592337608337402, - "objective/rlhf_reward": -6.103601590792337, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 27.431682586669922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58203125, - "step": 1136, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9959231615066528 - }, - { - "episode": 18208, - "epoch": 0.10909395932942685, - "loss/policy_avg": 0.08655049651861191, - "lr": 9.273389570552149e-06, - "objective/entropy": -251.33828735351562, - "objective/kl": 30.559293746948242, - "objective/non_score_reward": -1.52796471118927, - "objective/rlhf_reward": -4.164447735028203, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.2445521354675293, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.689453125, - "step": 1137, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9996728897094727 - }, - { - "episode": 18224, - "epoch": 0.1091898239685564, - "loss/policy_avg": 0.4302634000778198, - "lr": 9.272750511247446e-06, - "objective/entropy": -201.8494873046875, - "objective/kl": 29.424352645874023, - "objective/non_score_reward": -1.4712176322937012, - "objective/rlhf_reward": -1.4848706483840939, - "objective/scores": 1.1, - "policy/approxkl_avg": 20.697341918945312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8203125, - "step": 1138, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9989333152770996 - }, - { - "episode": 18240, - "epoch": 0.10928568860768595, - "loss/policy_avg": 0.9915270209312439, - "lr": 9.27211145194274e-06, - "objective/entropy": -195.59429931640625, - "objective/kl": 21.045230865478516, - "objective/non_score_reward": -1.052261471748352, - "objective/rlhf_reward": -2.80904603600502, - "objective/scores": 0.35, - "policy/approxkl_avg": 28.377094268798828, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71484375, - "step": 1139, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.998765230178833 - }, - { - "episode": 18256, - "epoch": 0.10938155324681549, - "loss/policy_avg": 0.49453747272491455, - "lr": 9.271472392638038e-06, - "objective/entropy": -245.22964477539062, - "objective/kl": 32.85436248779297, - "objective/non_score_reward": -1.6427181959152222, - "objective/rlhf_reward": -5.089920165951609, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 14.714433670043945, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.599609375, - "step": 1140, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.996521234512329 - }, - { - "episode": 18272, - "epoch": 0.10947741788594505, - "loss/policy_avg": 1.36152184009552, - "lr": 9.270833333333334e-06, - "objective/entropy": -272.47137451171875, - "objective/kl": 34.61804962158203, - "objective/non_score_reward": -1.7309024333953857, - "objective/rlhf_reward": -5.581974318533568, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.44586181640625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.654296875, - "step": 1141, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9999563694000244 - }, - { - "episode": 18288, - "epoch": 0.10957328252507459, - "loss/policy_avg": 0.2819780111312866, - "lr": 9.270194274028631e-06, - "objective/entropy": -202.5043487548828, - "objective/kl": 25.666091918945312, - "objective/non_score_reward": -1.2833045721054077, - "objective/rlhf_reward": -3.7739683029398154, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.4244799613952637, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.552734375, - "step": 1142, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999279260635376 - }, - { - "episode": 18304, - "epoch": 0.10966914716420414, - "loss/policy_avg": 0.25256872177124023, - "lr": 9.269555214723928e-06, - "objective/entropy": -231.06277465820312, - "objective/kl": 30.289072036743164, - "objective/non_score_reward": -1.514453649520874, - "objective/rlhf_reward": -4.657814359664917, - "objective/scores": 0.35, - "policy/approxkl_avg": 17.748353958129883, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71875, - "step": 1143, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9983248710632324 - }, - { - "episode": 18320, - "epoch": 0.10976501180333369, - "loss/policy_avg": -0.33820840716362, - "lr": 9.268916155419223e-06, - "objective/entropy": -73.95364379882812, - "objective/kl": 28.924686431884766, - "objective/non_score_reward": -1.4462342262268066, - "objective/rlhf_reward": -4.303984346802592, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 2.515535831451416, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.779296875, - "step": 1144, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0056748390197754 - }, - { - "episode": 18336, - "epoch": 0.10986087644246324, - "loss/policy_avg": 0.6078078746795654, - "lr": 9.26827709611452e-06, - "objective/entropy": -114.01469421386719, - "objective/kl": 33.08042526245117, - "objective/non_score_reward": -1.6540212631225586, - "objective/rlhf_reward": -5.13513237517631, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 18.6502628326416, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6796875, - "step": 1145, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998767614364624 - }, - { - "episode": 18352, - "epoch": 0.10995674108159278, - "loss/policy_avg": 0.34172698855400085, - "lr": 9.267638036809816e-06, - "objective/entropy": -220.97189331054688, - "objective/kl": 30.25277328491211, - "objective/non_score_reward": -1.512638807296753, - "objective/rlhf_reward": -4.317221657435099, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.9541758298873901, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.603515625, - "step": 1146, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0016610622406006 - }, - { - "episode": 18368, - "epoch": 0.11005260572072234, - "loss/policy_avg": 0.29632118344306946, - "lr": 9.266998977505112e-06, - "objective/entropy": -200.36410522460938, - "objective/kl": 26.179067611694336, - "objective/non_score_reward": -1.3089535236358643, - "objective/rlhf_reward": -2.835813796520233, - "objective/scores": 0.6, - "policy/approxkl_avg": 3.2951159477233887, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.576171875, - "step": 1147, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.004591464996338 - }, - { - "episode": 18384, - "epoch": 0.11014847035985188, - "loss/policy_avg": 0.011747203767299652, - "lr": 9.26635991820041e-06, - "objective/entropy": -194.40054321289062, - "objective/kl": 31.329753875732422, - "objective/non_score_reward": -1.5664877891540527, - "objective/rlhf_reward": -4.784998300488352, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.8787118196487427, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.466796875, - "step": 1148, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0023856163024902 - }, - { - "episode": 18400, - "epoch": 0.11024433499898144, - "loss/policy_avg": 0.46494680643081665, - "lr": 9.265720858895706e-06, - "objective/entropy": -223.58827209472656, - "objective/kl": 28.735855102539062, - "objective/non_score_reward": -1.4367928504943848, - "objective/rlhf_reward": -4.347171103954315, - "objective/scores": 0.35, - "policy/approxkl_avg": 3.884065866470337, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.490234375, - "step": 1149, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982936382293701 - }, - { - "episode": 18416, - "epoch": 0.11034019963811098, - "loss/policy_avg": 0.28439557552337646, - "lr": 9.265081799591003e-06, - "objective/entropy": -147.24366760253906, - "objective/kl": 34.880985260009766, - "objective/non_score_reward": -1.7440491914749146, - "objective/rlhf_reward": -5.242863551775614, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 79.35762023925781, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.734375, - "step": 1150, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9970107078552246 - }, - { - "episode": 18432, - "epoch": 0.11043606427724054, - "loss/policy_avg": 0.3585757613182068, - "lr": 9.2644427402863e-06, - "objective/entropy": -71.19611358642578, - "objective/kl": 26.725967407226562, - "objective/non_score_reward": -1.3362984657287598, - "objective/rlhf_reward": -3.8642411856011147, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 5.22227668762207, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4990234375, - "step": 1151, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9982261657714844 - }, - { - "episode": 18448, - "epoch": 0.11053192891637008, - "loss/policy_avg": -0.044132016599178314, - "lr": 9.263803680981595e-06, - "objective/entropy": -228.6917724609375, - "objective/kl": 28.40880584716797, - "objective/non_score_reward": -1.4204403162002563, - "objective/rlhf_reward": -5.6817615032196045, - "objective/scores": 0.0, - "policy/approxkl_avg": 1.9962811470031738, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.66015625, - "step": 1152, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9992928504943848 - }, - { - "episode": 18464, - "epoch": 0.11062779355549963, - "loss/policy_avg": 0.0064825452864170074, - "lr": 9.263164621676892e-06, - "objective/entropy": -258.4649658203125, - "objective/kl": 27.05806541442871, - "objective/non_score_reward": -1.3529033660888672, - "objective/rlhf_reward": -3.9306607274368996, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 6.10453462600708, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.546875, - "step": 1153, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0005645751953125 - }, - { - "episode": 18480, - "epoch": 0.11072365819462919, - "loss/policy_avg": -0.0035511665046215057, - "lr": 9.262525562372189e-06, - "objective/entropy": -282.88446044921875, - "objective/kl": 30.65878677368164, - "objective/non_score_reward": -1.5329391956329346, - "objective/rlhf_reward": -4.575497834888056, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 1.0608371496200562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.693359375, - "step": 1154, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0007755756378174 - }, - { - "episode": 18496, - "epoch": 0.11081952283375873, - "loss/policy_avg": 0.02788732573390007, - "lr": 9.261886503067486e-06, - "objective/entropy": -116.1088638305664, - "objective/kl": 30.207550048828125, - "objective/non_score_reward": -1.5103774070739746, - "objective/rlhf_reward": -3.918803753630195, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 75.21327209472656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.619140625, - "step": 1155, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990315437316895 - }, - { - "episode": 18512, - "epoch": 0.11091538747288829, - "loss/policy_avg": 0.32094255089759827, - "lr": 9.261247443762783e-06, - "objective/entropy": -214.9591064453125, - "objective/kl": 27.392032623291016, - "objective/non_score_reward": -1.3696017265319824, - "objective/rlhf_reward": -3.355700316206489, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 28.679149627685547, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.572265625, - "step": 1156, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9985802173614502 - }, - { - "episode": 18528, - "epoch": 0.11101125211201783, - "loss/policy_avg": 0.20208770036697388, - "lr": 9.260608384458078e-06, - "objective/entropy": -160.56893920898438, - "objective/kl": 37.55027770996094, - "objective/non_score_reward": -1.8775138854980469, - "objective/rlhf_reward": -5.848196154058563, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 12.605989456176758, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.50390625, - "step": 1157, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999098777770996 - }, - { - "episode": 18544, - "epoch": 0.11110711675114739, - "loss/policy_avg": 0.3753480613231659, - "lr": 9.259969325153375e-06, - "objective/entropy": -242.1776123046875, - "objective/kl": 39.135337829589844, - "objective/non_score_reward": -1.9567670822143555, - "objective/rlhf_reward": -5.427068269252777, - "objective/scores": 0.6, - "policy/approxkl_avg": 35.09158706665039, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.68359375, - "step": 1158, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.99853515625 - }, - { - "episode": 18560, - "epoch": 0.11120298139027693, - "loss/policy_avg": -0.17678791284561157, - "lr": 9.259330265848672e-06, - "objective/entropy": -155.45452880859375, - "objective/kl": 29.033279418945312, - "objective/non_score_reward": -1.4516640901565552, - "objective/rlhf_reward": -4.40665636062622, - "objective/scores": 0.35, - "policy/approxkl_avg": 9.420263290405273, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.640625, - "step": 1159, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9994099140167236 - }, - { - "episode": 18576, - "epoch": 0.11129884602940648, - "loss/policy_avg": 0.2095283716917038, - "lr": 9.258691206543968e-06, - "objective/entropy": -245.34713745117188, - "objective/kl": 27.264514923095703, - "objective/non_score_reward": -1.3632256984710693, - "objective/rlhf_reward": -3.896643607822016, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.997028350830078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 1160, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9966801404953003 - }, - { - "episode": 18592, - "epoch": 0.11139471066853603, - "loss/policy_avg": 0.35818007588386536, - "lr": 9.258052147239265e-06, - "objective/entropy": -235.89605712890625, - "objective/kl": 27.769607543945312, - "objective/non_score_reward": -1.3884804248809814, - "objective/rlhf_reward": -1.153921282291412, - "objective/scores": 1.1, - "policy/approxkl_avg": 22.18886375427246, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.771484375, - "step": 1161, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9975862503051758 - }, - { - "episode": 18608, - "epoch": 0.11149057530766558, - "loss/policy_avg": 0.31447115540504456, - "lr": 9.257413087934562e-06, - "objective/entropy": -129.99705505371094, - "objective/kl": 39.8328742980957, - "objective/non_score_reward": -1.9916437864303589, - "objective/rlhf_reward": -6.410315840449885, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 15.095479011535645, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7265625, - "step": 1162, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9973938465118408 - }, - { - "episode": 18624, - "epoch": 0.11158643994679512, - "loss/policy_avg": 0.0677080750465393, - "lr": 9.256774028629857e-06, - "objective/entropy": -159.806884765625, - "objective/kl": 28.60342788696289, - "objective/non_score_reward": -1.4301713705062866, - "objective/rlhf_reward": -4.3420833135522425, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 4.259771347045898, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.65625, - "step": 1163, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000201940536499 - }, - { - "episode": 18640, - "epoch": 0.11168230458592468, - "loss/policy_avg": 0.19306568801403046, - "lr": 9.256134969325154e-06, - "objective/entropy": -209.5618133544922, - "objective/kl": 19.490875244140625, - "objective/non_score_reward": -0.9745436906814575, - "objective/rlhf_reward": -0.9744557484400003, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 0.33440613746643066, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.59765625, - "step": 1164, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9999253749847412 - }, - { - "episode": 18656, - "epoch": 0.11177816922505422, - "loss/policy_avg": 0.11631269752979279, - "lr": 9.255495910020451e-06, - "objective/entropy": -141.29168701171875, - "objective/kl": 37.15015411376953, - "objective/non_score_reward": -1.8575077056884766, - "objective/rlhf_reward": -5.873771755900934, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 2.6151018142700195, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.51953125, - "step": 1165, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0023040771484375 - }, - { - "episode": 18672, - "epoch": 0.11187403386418378, - "loss/policy_avg": 0.01576380617916584, - "lr": 9.254856850715748e-06, - "objective/entropy": -215.0299835205078, - "objective/kl": 14.439537048339844, - "objective/non_score_reward": -0.7219768762588501, - "objective/rlhf_reward": -2.8879075050354004, - "objective/scores": 0.0, - "policy/approxkl_avg": 1.3693623542785645, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.509765625, - "step": 1166, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.003568172454834 - }, - { - "episode": 18688, - "epoch": 0.11196989850331332, - "loss/policy_avg": 0.08836716413497925, - "lr": 9.254217791411043e-06, - "objective/entropy": -220.81651306152344, - "objective/kl": 27.33843994140625, - "objective/non_score_reward": -1.3669219017028809, - "objective/rlhf_reward": -3.951916241439518, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 1.9005239009857178, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7109375, - "step": 1167, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.998638391494751 - }, - { - "episode": 18704, - "epoch": 0.11206576314244288, - "loss/policy_avg": 0.1386057436466217, - "lr": 9.25357873210634e-06, - "objective/entropy": -206.8209686279297, - "objective/kl": 30.620820999145508, - "objective/non_score_reward": -1.531041145324707, - "objective/rlhf_reward": -4.390831009546916, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 2.888638973236084, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 1168, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985263347625732 - }, - { - "episode": 18720, - "epoch": 0.11216162778157242, - "loss/policy_avg": 0.17286451160907745, - "lr": 9.252939672801637e-06, - "objective/entropy": -276.5692138671875, - "objective/kl": 31.233203887939453, - "objective/non_score_reward": -1.5616602897644043, - "objective/rlhf_reward": -4.730869614871677, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 7.134778022766113, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.671875, - "step": 1169, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9994456768035889 - }, - { - "episode": 18736, - "epoch": 0.11225749242070197, - "loss/policy_avg": 0.31586384773254395, - "lr": 9.252300613496932e-06, - "objective/entropy": -248.99765014648438, - "objective/kl": 33.04867172241211, - "objective/non_score_reward": -1.6524336338043213, - "objective/rlhf_reward": -5.005614492956715, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 6.374646186828613, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.626953125, - "step": 1170, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998569369316101 - }, - { - "episode": 18752, - "epoch": 0.11235335705983152, - "loss/policy_avg": -0.09263397008180618, - "lr": 9.251661554192229e-06, - "objective/entropy": -183.73135375976562, - "objective/kl": 29.070640563964844, - "objective/non_score_reward": -1.4535319805145264, - "objective/rlhf_reward": -3.9892991736260166, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 47.9519157409668, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.751953125, - "step": 1171, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0019617080688477 - }, - { - "episode": 18768, - "epoch": 0.11244922169896107, - "loss/policy_avg": -0.012390676885843277, - "lr": 9.251022494887526e-06, - "objective/entropy": -198.5019073486328, - "objective/kl": 33.66993713378906, - "objective/non_score_reward": -1.6834967136383057, - "objective/rlhf_reward": -5.333986735343933, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.644756317138672, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6640625, - "step": 1172, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0004210472106934 - }, - { - "episode": 18784, - "epoch": 0.11254508633809061, - "loss/policy_avg": -0.12474697828292847, - "lr": 9.250383435582823e-06, - "objective/entropy": -258.70025634765625, - "objective/kl": 36.01386260986328, - "objective/non_score_reward": -1.8006932735443115, - "objective/rlhf_reward": -5.824170806495053, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 11.808134078979492, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.671875, - "step": 1173, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0037684440612793 - }, - { - "episode": 18800, - "epoch": 0.11264095097722017, - "loss/policy_avg": 0.06612593680620193, - "lr": 9.24974437627812e-06, - "objective/entropy": -211.03541564941406, - "objective/kl": 28.66901397705078, - "objective/non_score_reward": -1.4334505796432495, - "objective/rlhf_reward": -4.39216672471109, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.856602191925049, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6796875, - "step": 1174, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0004348754882812 - }, - { - "episode": 18816, - "epoch": 0.11273681561634971, - "loss/policy_avg": 0.17440900206565857, - "lr": 9.249105316973417e-06, - "objective/entropy": -233.4525146484375, - "objective/kl": 26.882205963134766, - "objective/non_score_reward": -1.344110369682312, - "objective/rlhf_reward": -3.9978394890702784, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 91.22392272949219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 1175, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9991576671600342 - }, - { - "episode": 18832, - "epoch": 0.11283268025547927, - "loss/policy_avg": 0.5238691568374634, - "lr": 9.248466257668712e-06, - "objective/entropy": -173.6719970703125, - "objective/kl": 34.459197998046875, - "objective/non_score_reward": -1.7229597568511963, - "objective/rlhf_reward": -5.335579781737879, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 25.83188247680664, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6796875, - "step": 1176, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986010789871216 - }, - { - "episode": 18848, - "epoch": 0.11292854489460881, - "loss/policy_avg": -0.12078897655010223, - "lr": 9.247827198364009e-06, - "objective/entropy": -134.12008666992188, - "objective/kl": 34.92095184326172, - "objective/non_score_reward": -1.7460476160049438, - "objective/rlhf_reward": -5.159361715587686, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 7.610663414001465, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.751953125, - "step": 1177, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0009992122650146 - }, - { - "episode": 18864, - "epoch": 0.11302440953373837, - "loss/policy_avg": 0.3817252516746521, - "lr": 9.247188139059305e-06, - "objective/entropy": -96.26307678222656, - "objective/kl": 44.49664306640625, - "objective/non_score_reward": -2.224832057952881, - "objective/rlhf_reward": -8.899328708648682, - "objective/scores": 0.0, - "policy/approxkl_avg": 10.693860054016113, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.80859375, - "step": 1178, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.997776985168457 - }, - { - "episode": 18880, - "epoch": 0.11312027417286791, - "loss/policy_avg": 0.07123968750238419, - "lr": 9.246549079754602e-06, - "objective/entropy": -199.199951171875, - "objective/kl": 27.166889190673828, - "objective/non_score_reward": -1.358344554901123, - "objective/rlhf_reward": -3.98277972182785, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 8.361668586730957, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587890625, - "step": 1179, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9971075057983398 - }, - { - "episode": 18896, - "epoch": 0.11321613881199746, - "loss/policy_avg": 0.14846912026405334, - "lr": 9.2459100204499e-06, - "objective/entropy": -175.1884765625, - "objective/kl": 31.658098220825195, - "objective/non_score_reward": -1.5829048156738281, - "objective/rlhf_reward": -4.6697597555523975, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 0.9070639610290527, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6875, - "step": 1180, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0005788803100586 - }, - { - "episode": 18912, - "epoch": 0.113312003451127, - "loss/policy_avg": 0.3216549754142761, - "lr": 9.245270961145194e-06, - "objective/entropy": -182.9542236328125, - "objective/kl": 31.30569839477539, - "objective/non_score_reward": -1.5652849674224854, - "objective/rlhf_reward": -4.70488056441839, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.701448440551758, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.619140625, - "step": 1181, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9980485439300537 - }, - { - "episode": 18928, - "epoch": 0.11340786809025656, - "loss/policy_avg": 0.4251779019832611, - "lr": 9.244631901840491e-06, - "objective/entropy": -195.88975524902344, - "objective/kl": 28.441465377807617, - "objective/non_score_reward": -1.4220733642578125, - "objective/rlhf_reward": -4.0841732359567455, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 14.071691513061523, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.623046875, - "step": 1182, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9986789226531982 - }, - { - "episode": 18944, - "epoch": 0.1135037327293861, - "loss/policy_avg": 0.02354581654071808, - "lr": 9.243992842535788e-06, - "objective/entropy": -164.94105529785156, - "objective/kl": 30.754886627197266, - "objective/non_score_reward": -1.537744402885437, - "objective/rlhf_reward": -3.227258478046629, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 0.801190972328186, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.669921875, - "step": 1183, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.002558708190918 - }, - { - "episode": 18960, - "epoch": 0.11359959736851566, - "loss/policy_avg": 0.04112057387828827, - "lr": 9.243353783231085e-06, - "objective/entropy": -246.19515991210938, - "objective/kl": 34.75521469116211, - "objective/non_score_reward": -1.7377607822418213, - "objective/rlhf_reward": -5.609407058268218, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 22.861713409423828, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.66015625, - "step": 1184, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9987796545028687 - }, - { - "episode": 18976, - "epoch": 0.1136954620076452, - "loss/policy_avg": 0.21404039859771729, - "lr": 9.242714723926382e-06, - "objective/entropy": -209.3376922607422, - "objective/kl": 35.15364074707031, - "objective/non_score_reward": -1.7576820850372314, - "objective/rlhf_reward": -5.083317349629338, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 45.4697265625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.650390625, - "step": 1185, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996396541595459 - }, - { - "episode": 18992, - "epoch": 0.11379132664677476, - "loss/policy_avg": -0.016785871237516403, - "lr": 9.242075664621679e-06, - "objective/entropy": -135.11508178710938, - "objective/kl": 44.11357879638672, - "objective/non_score_reward": -2.205678939819336, - "objective/rlhf_reward": -7.160855894506561, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 20.996137619018555, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.609375, - "step": 1186, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9976164102554321 - }, - { - "episode": 19008, - "epoch": 0.1138871912859043, - "loss/policy_avg": -0.0332149937748909, - "lr": 9.241436605316974e-06, - "objective/entropy": -85.4975814819336, - "objective/kl": 33.72305679321289, - "objective/non_score_reward": -1.6861528158187866, - "objective/rlhf_reward": -2.3446113824844357, - "objective/scores": 1.1, - "policy/approxkl_avg": 115.56517028808594, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.849609375, - "step": 1187, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9999394416809082 - }, - { - "episode": 19024, - "epoch": 0.11398305592503385, - "loss/policy_avg": 0.10150502622127533, - "lr": 9.240797546012271e-06, - "objective/entropy": -228.79638671875, - "objective/kl": 24.752819061279297, - "objective/non_score_reward": -1.2376409769058228, - "objective/rlhf_reward": -4.9505637884140015, - "objective/scores": 0.0, - "policy/approxkl_avg": 1.5263676643371582, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.517578125, - "step": 1188, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0000452995300293 - }, - { - "episode": 19040, - "epoch": 0.1140789205641634, - "loss/policy_avg": 1.6640098094940186, - "lr": 9.240158486707568e-06, - "objective/entropy": -222.82388305664062, - "objective/kl": 37.1962890625, - "objective/non_score_reward": -1.8598144054412842, - "objective/rlhf_reward": -5.958305004055857, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 26.163982391357422, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.583984375, - "step": 1189, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998447299003601 - }, - { - "episode": 19056, - "epoch": 0.11417478520329295, - "loss/policy_avg": 0.24511002004146576, - "lr": 9.239519427402863e-06, - "objective/entropy": -169.49942016601562, - "objective/kl": 23.688583374023438, - "objective/non_score_reward": -1.1844291687011719, - "objective/rlhf_reward": -3.0758574656849964, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 25.747421264648438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.83984375, - "step": 1190, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997087001800537 - }, - { - "episode": 19072, - "epoch": 0.1142706498424225, - "loss/policy_avg": 0.05934782326221466, - "lr": 9.23888036809816e-06, - "objective/entropy": -195.64088439941406, - "objective/kl": 33.113624572753906, - "objective/non_score_reward": -1.6556813716888428, - "objective/rlhf_reward": -5.172127108188018, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 15.268495559692383, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.765625, - "step": 1191, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.998260736465454 - }, - { - "episode": 19088, - "epoch": 0.11436651448155205, - "loss/policy_avg": 0.15776914358139038, - "lr": 9.238241308793457e-06, - "objective/entropy": -227.42486572265625, - "objective/kl": 30.864715576171875, - "objective/non_score_reward": -1.5432357788085938, - "objective/rlhf_reward": -4.722344736667022, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 2.075873613357544, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.703125, - "step": 1192, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991707801818848 - }, - { - "episode": 19104, - "epoch": 0.11446237912068159, - "loss/policy_avg": 0.07561061531305313, - "lr": 9.237602249488754e-06, - "objective/entropy": -201.01284790039062, - "objective/kl": 36.205970764160156, - "objective/non_score_reward": -1.8102984428405762, - "objective/rlhf_reward": -5.790595392794952, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 0.3898843228816986, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.640625, - "step": 1193, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0015416145324707 - }, - { - "episode": 19120, - "epoch": 0.11455824375981115, - "loss/policy_avg": -0.5701497793197632, - "lr": 9.236963190184049e-06, - "objective/entropy": -103.09819030761719, - "objective/kl": 30.238616943359375, - "objective/non_score_reward": -1.5119309425354004, - "objective/rlhf_reward": -4.56677112263, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 4.551431655883789, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.59375, - "step": 1194, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.004034996032715 - }, - { - "episode": 19136, - "epoch": 0.11465410839894069, - "loss/policy_avg": 0.46027839183807373, - "lr": 9.236324130879346e-06, - "objective/entropy": -208.53213500976562, - "objective/kl": 33.96599197387695, - "objective/non_score_reward": -1.6982996463775635, - "objective/rlhf_reward": -4.968369956287455, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 32.79998016357422, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.76953125, - "step": 1195, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9957493543624878 - }, - { - "episode": 19152, - "epoch": 0.11474997303807025, - "loss/policy_avg": 0.7104591131210327, - "lr": 9.235685071574642e-06, - "objective/entropy": -161.5511932373047, - "objective/kl": 21.182106018066406, - "objective/non_score_reward": -1.0591052770614624, - "objective/rlhf_reward": -2.7554683117226357, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 113.69915771484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.810546875, - "step": 1196, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9965519905090332 - }, - { - "episode": 19168, - "epoch": 0.11484583767719979, - "loss/policy_avg": -0.09501040726900101, - "lr": 9.23504601226994e-06, - "objective/entropy": -149.43408203125, - "objective/kl": 36.27130126953125, - "objective/non_score_reward": -1.8135650157928467, - "objective/rlhf_reward": -7.254260301589966, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.328031539916992, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62109375, - "step": 1197, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0035271644592285 - }, - { - "episode": 19184, - "epoch": 0.11494170231632934, - "loss/policy_avg": 0.566871702671051, - "lr": 9.234406952965236e-06, - "objective/entropy": -217.4463653564453, - "objective/kl": 29.27811050415039, - "objective/non_score_reward": -1.4639055728912354, - "objective/rlhf_reward": -4.51398654869142, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 53.84545135498047, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58203125, - "step": 1198, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9971461296081543 - }, - { - "episode": 19200, - "epoch": 0.11503756695545889, - "loss/policy_avg": 0.048794396221637726, - "lr": 9.233767893660533e-06, - "objective/entropy": -174.531005859375, - "objective/kl": 29.087738037109375, - "objective/non_score_reward": -1.4543869495391846, - "objective/rlhf_reward": -2.893828962684843, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 37.782432556152344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.791015625, - "step": 1199, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9999170303344727 - }, - { - "episode": 19216, - "epoch": 0.11513343159458844, - "loss/policy_avg": -0.18217583000659943, - "lr": 9.233128834355828e-06, - "objective/entropy": -135.63037109375, - "objective/kl": 40.30628967285156, - "objective/non_score_reward": -2.0153145790100098, - "objective/rlhf_reward": -6.545486414226231, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 8.027623176574707, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.75, - "step": 1200, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.00277042388916 - }, - { - "episode": 19232, - "epoch": 0.11522929623371798, - "loss/policy_avg": 0.19046634435653687, - "lr": 9.232489775051125e-06, - "objective/entropy": -245.74639892578125, - "objective/kl": 29.675251007080078, - "objective/non_score_reward": -1.483762502670288, - "objective/rlhf_reward": -4.575800263617916, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.7879266738891602, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.61328125, - "step": 1201, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.00600528717041 - }, - { - "episode": 19248, - "epoch": 0.11532516087284754, - "loss/policy_avg": 0.21539102494716644, - "lr": 9.231850715746422e-06, - "objective/entropy": -225.9239044189453, - "objective/kl": 28.502708435058594, - "objective/non_score_reward": -1.4251353740692139, - "objective/rlhf_reward": -4.144282071796015, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 23.979963302612305, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.712890625, - "step": 1202, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997289180755615 - }, - { - "episode": 19264, - "epoch": 0.11542102551197708, - "loss/policy_avg": 0.04965958744287491, - "lr": 9.231211656441719e-06, - "objective/entropy": -216.04248046875, - "objective/kl": 31.314760208129883, - "objective/non_score_reward": -1.5657379627227783, - "objective/rlhf_reward": -4.1402458570161205, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 8.937564849853516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5234375, - "step": 1203, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9976413249969482 - }, - { - "episode": 19280, - "epoch": 0.11551689015110664, - "loss/policy_avg": 0.02601933479309082, - "lr": 9.230572597137016e-06, - "objective/entropy": -148.66250610351562, - "objective/kl": 32.67079162597656, - "objective/non_score_reward": -1.6335396766662598, - "objective/rlhf_reward": -5.018386685641941, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 18.645111083984375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.79296875, - "step": 1204, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.004350185394287 - }, - { - "episode": 19296, - "epoch": 0.11561275479023618, - "loss/policy_avg": 1.4007536172866821, - "lr": 9.229933537832311e-06, - "objective/entropy": -258.147705078125, - "objective/kl": 34.21760559082031, - "objective/non_score_reward": -1.7108802795410156, - "objective/rlhf_reward": -6.843520998954773, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.961824417114258, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.619140625, - "step": 1205, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0000836849212646 - }, - { - "episode": 19312, - "epoch": 0.11570861942936574, - "loss/policy_avg": -0.39194512367248535, - "lr": 9.229294478527608e-06, - "objective/entropy": -100.05964660644531, - "objective/kl": 36.88145065307617, - "objective/non_score_reward": -1.8440725803375244, - "objective/rlhf_reward": -5.551461453708718, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 8.472518920898438, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.599609375, - "step": 1206, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0011942386627197 - }, - { - "episode": 19328, - "epoch": 0.11580448406849528, - "loss/policy_avg": 0.31982097029685974, - "lr": 9.228655419222905e-06, - "objective/entropy": -219.31304931640625, - "objective/kl": 38.748992919921875, - "objective/non_score_reward": -1.9374498128890991, - "objective/rlhf_reward": -3.349799251556396, - "objective/scores": 1.1, - "policy/approxkl_avg": 78.4788818359375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.701171875, - "step": 1207, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9975740909576416 - }, - { - "episode": 19344, - "epoch": 0.11590034870762483, - "loss/policy_avg": 0.28548291325569153, - "lr": 9.228016359918202e-06, - "objective/entropy": -157.26446533203125, - "objective/kl": 40.80043029785156, - "objective/non_score_reward": -2.0400216579437256, - "objective/rlhf_reward": -6.335257883342813, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 8.33885383605957, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.767578125, - "step": 1208, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9999432563781738 - }, - { - "episode": 19360, - "epoch": 0.11599621334675438, - "loss/policy_avg": 0.03618919104337692, - "lr": 9.227377300613499e-06, - "objective/entropy": -179.19644165039062, - "objective/kl": 33.20772933959961, - "objective/non_score_reward": -1.6603864431381226, - "objective/rlhf_reward": -4.908212439219157, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 4.311634540557861, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 1209, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9987711906433105 - }, - { - "episode": 19376, - "epoch": 0.11609207798588393, - "loss/policy_avg": 0.15800103545188904, - "lr": 9.226738241308795e-06, - "objective/entropy": -270.1763916015625, - "objective/kl": 29.480056762695312, - "objective/non_score_reward": -1.4740028381347656, - "objective/rlhf_reward": -4.570498857527895, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 2.025053024291992, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.603515625, - "step": 1210, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997854232788086 - }, - { - "episode": 19392, - "epoch": 0.11618794262501349, - "loss/policy_avg": 0.08228084444999695, - "lr": 9.22609918200409e-06, - "objective/entropy": -265.70428466796875, - "objective/kl": 27.694522857666016, - "objective/non_score_reward": -1.3847262859344482, - "objective/rlhf_reward": -4.1389049053192135, - "objective/scores": 0.35, - "policy/approxkl_avg": 11.403460502624512, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.546875, - "step": 1211, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9995503425598145 - }, - { - "episode": 19408, - "epoch": 0.11628380726414303, - "loss/policy_avg": 0.24947790801525116, - "lr": 9.225460122699387e-06, - "objective/entropy": -214.40487670898438, - "objective/kl": 36.13543701171875, - "objective/non_score_reward": -1.8067721128463745, - "objective/rlhf_reward": -7.227088212966919, - "objective/scores": 0.0, - "policy/approxkl_avg": 13.953628540039062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.603515625, - "step": 1212, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9974710941314697 - }, - { - "episode": 19424, - "epoch": 0.11637967190327259, - "loss/policy_avg": 0.25788062810897827, - "lr": 9.224821063394683e-06, - "objective/entropy": -189.17974853515625, - "objective/kl": 29.4897518157959, - "objective/non_score_reward": -1.4744876623153687, - "objective/rlhf_reward": -4.29383042818697, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.7698781490325928, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58984375, - "step": 1213, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9973881244659424 - }, - { - "episode": 19440, - "epoch": 0.11647553654240213, - "loss/policy_avg": -0.10094030201435089, - "lr": 9.22418200408998e-06, - "objective/entropy": -178.26290893554688, - "objective/kl": 36.244503021240234, - "objective/non_score_reward": -1.812225103378296, - "objective/rlhf_reward": -5.644780430857258, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 3.1554298400878906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.673828125, - "step": 1214, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0035483837127686 - }, - { - "episode": 19456, - "epoch": 0.11657140118153168, - "loss/policy_avg": -0.2695544958114624, - "lr": 9.223542944785276e-06, - "objective/entropy": -224.8712158203125, - "objective/kl": 32.469635009765625, - "objective/non_score_reward": -1.6234817504882812, - "objective/rlhf_reward": -3.57020804727194, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 1.8041396141052246, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.65234375, - "step": 1215, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0009067058563232 - }, - { - "episode": 19472, - "epoch": 0.11666726582066123, - "loss/policy_avg": 0.19596442580223083, - "lr": 9.222903885480573e-06, - "objective/entropy": -216.5953369140625, - "objective/kl": 32.196380615234375, - "objective/non_score_reward": -1.6098190546035767, - "objective/rlhf_reward": -5.039275979995727, - "objective/scores": 0.35, - "policy/approxkl_avg": 3.0333876609802246, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6875, - "step": 1216, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0020110607147217 - }, - { - "episode": 19488, - "epoch": 0.11676313045979078, - "loss/policy_avg": -0.0265303086489439, - "lr": 9.22226482617587e-06, - "objective/entropy": -177.5919189453125, - "objective/kl": 27.259849548339844, - "objective/non_score_reward": -1.362992525100708, - "objective/rlhf_reward": -2.528251205326292, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 2.3001770973205566, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.611328125, - "step": 1217, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9996999502182007 - }, - { - "episode": 19504, - "epoch": 0.11685899509892032, - "loss/policy_avg": 0.21211574971675873, - "lr": 9.221625766871165e-06, - "objective/entropy": -239.07907104492188, - "objective/kl": 20.40753936767578, - "objective/non_score_reward": -1.0203769207000732, - "objective/rlhf_reward": -2.681507921218872, - "objective/scores": 0.35, - "policy/approxkl_avg": 2.7712390422821045, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.564453125, - "step": 1218, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0028648376464844 - }, - { - "episode": 19520, - "epoch": 0.11695485973804988, - "loss/policy_avg": 0.07235918194055557, - "lr": 9.220986707566462e-06, - "objective/entropy": -152.53878784179688, - "objective/kl": 21.97917366027832, - "objective/non_score_reward": -1.0989587306976318, - "objective/rlhf_reward": -1.4721155508768287, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 3.2449498176574707, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 1219, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997578501701355 - }, - { - "episode": 19536, - "epoch": 0.11705072437717942, - "loss/policy_avg": 0.5660937428474426, - "lr": 9.220347648261759e-06, - "objective/entropy": -249.38014221191406, - "objective/kl": 41.259254455566406, - "objective/non_score_reward": -2.062962532043457, - "objective/rlhf_reward": -6.873248794165951, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 4.481976509094238, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.66796875, - "step": 1220, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982693195343018 - }, - { - "episode": 19552, - "epoch": 0.11714658901630898, - "loss/policy_avg": 0.01150442287325859, - "lr": 9.219708588957056e-06, - "objective/entropy": -215.35882568359375, - "objective/kl": 34.312686920166016, - "objective/non_score_reward": -1.7156343460083008, - "objective/rlhf_reward": -5.346765959056553, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 17.738174438476562, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.69140625, - "step": 1221, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9997079372406006 - }, - { - "episode": 19568, - "epoch": 0.11724245365543852, - "loss/policy_avg": -0.059517666697502136, - "lr": 9.219069529652353e-06, - "objective/entropy": -210.27809143066406, - "objective/kl": 27.43333625793457, - "objective/non_score_reward": -1.3716667890548706, - "objective/rlhf_reward": -4.005714538510203, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 16.074115753173828, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 1222, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001340389251709 - }, - { - "episode": 19584, - "epoch": 0.11733831829456808, - "loss/policy_avg": -0.15733516216278076, - "lr": 9.21843047034765e-06, - "objective/entropy": -235.85507202148438, - "objective/kl": 28.09206199645996, - "objective/non_score_reward": -1.4046030044555664, - "objective/rlhf_reward": -4.167813996882781, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 98.53274536132812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.63671875, - "step": 1223, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0014100074768066 - }, - { - "episode": 19600, - "epoch": 0.11743418293369762, - "loss/policy_avg": 0.035943709313869476, - "lr": 9.217791411042945e-06, - "objective/entropy": -244.24017333984375, - "objective/kl": 37.203941345214844, - "objective/non_score_reward": -1.8601970672607422, - "objective/rlhf_reward": -6.115275297194643, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 3.5133166313171387, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.697265625, - "step": 1224, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9994046688079834 - }, - { - "episode": 19616, - "epoch": 0.11753004757282717, - "loss/policy_avg": 0.1306331604719162, - "lr": 9.217152351738242e-06, - "objective/entropy": -190.14393615722656, - "objective/kl": 33.84905242919922, - "objective/non_score_reward": -1.6924527883529663, - "objective/rlhf_reward": -5.254039132388767, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 3.3994088172912598, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.705078125, - "step": 1225, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9983464479446411 - }, - { - "episode": 19632, - "epoch": 0.11762591221195672, - "loss/policy_avg": 0.0009730234742164612, - "lr": 9.216513292433539e-06, - "objective/entropy": -216.55715942382812, - "objective/kl": 30.103256225585938, - "objective/non_score_reward": -1.5051627159118652, - "objective/rlhf_reward": -3.8979449889817577, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.51433527469635, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62109375, - "step": 1226, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0015969276428223 - }, - { - "episode": 19648, - "epoch": 0.11772177685108627, - "loss/policy_avg": 0.16437333822250366, - "lr": 9.215874233128836e-06, - "objective/entropy": -255.0314178466797, - "objective/kl": 45.41230010986328, - "objective/non_score_reward": -2.2706151008605957, - "objective/rlhf_reward": -7.478340182367878, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 13.10407829284668, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.693359375, - "step": 1227, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9988343715667725 - }, - { - "episode": 19664, - "epoch": 0.11781764149021581, - "loss/policy_avg": 0.0678139179944992, - "lr": 9.215235173824132e-06, - "objective/entropy": -190.25567626953125, - "objective/kl": 31.204730987548828, - "objective/non_score_reward": -1.5602366924285889, - "objective/rlhf_reward": -4.725174867900547, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 0.9944963455200195, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.640625, - "step": 1228, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.003727436065674 - }, - { - "episode": 19680, - "epoch": 0.11791350612934537, - "loss/policy_avg": 0.10750436782836914, - "lr": 9.21459611451943e-06, - "objective/entropy": -212.99404907226562, - "objective/kl": 31.576601028442383, - "objective/non_score_reward": -1.5788300037384033, - "objective/rlhf_reward": -3.391601239086363, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 5.086095333099365, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71875, - "step": 1229, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9975237846374512 - }, - { - "episode": 19696, - "epoch": 0.11800937076847491, - "loss/policy_avg": 0.26910707354545593, - "lr": 9.213957055214725e-06, - "objective/entropy": -264.12017822265625, - "objective/kl": 27.552576065063477, - "objective/non_score_reward": -1.3776288032531738, - "objective/rlhf_reward": -3.7771820584932962, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 14.264134407043457, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.671875, - "step": 1230, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9975985288619995 - }, - { - "episode": 19712, - "epoch": 0.11810523540760447, - "loss/policy_avg": 0.09155163168907166, - "lr": 9.213317995910021e-06, - "objective/entropy": -141.91424560546875, - "objective/kl": 32.08643341064453, - "objective/non_score_reward": -1.6043215990066528, - "objective/rlhf_reward": -4.5924574091759425, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 0.6272682547569275, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5390625, - "step": 1231, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0007212162017822 - }, - { - "episode": 19728, - "epoch": 0.11820110004673401, - "loss/policy_avg": 0.028797071427106857, - "lr": 9.212678936605318e-06, - "objective/entropy": -301.3397216796875, - "objective/kl": 29.216651916503906, - "objective/non_score_reward": -1.4608327150344849, - "objective/rlhf_reward": -4.287071674075678, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 11.006927490234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 1232, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9975908994674683 - }, - { - "episode": 19744, - "epoch": 0.11829696468586356, - "loss/policy_avg": 0.12966430187225342, - "lr": 9.212039877300615e-06, - "objective/entropy": -220.30935668945312, - "objective/kl": 42.5980224609375, - "objective/non_score_reward": -2.12990140914917, - "objective/rlhf_reward": -6.396899165884529, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 10.670743942260742, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.68359375, - "step": 1233, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9994142055511475 - }, - { - "episode": 19760, - "epoch": 0.1183928293249931, - "loss/policy_avg": 0.3277433514595032, - "lr": 9.21140081799591e-06, - "objective/entropy": -144.93858337402344, - "objective/kl": 34.81742858886719, - "objective/non_score_reward": -1.740871548652649, - "objective/rlhf_reward": -5.447714292796787, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 144.9310302734375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.67578125, - "step": 1234, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9978101253509521 - }, - { - "episode": 19776, - "epoch": 0.11848869396412266, - "loss/policy_avg": 0.6404599547386169, - "lr": 9.210761758691207e-06, - "objective/entropy": -259.30499267578125, - "objective/kl": 39.584476470947266, - "objective/non_score_reward": -1.9792238473892212, - "objective/rlhf_reward": -6.18356229464213, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 9.638875961303711, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.640625, - "step": 1235, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9967763423919678 - }, - { - "episode": 19792, - "epoch": 0.1185845586032522, - "loss/policy_avg": 0.20158489048480988, - "lr": 9.210122699386504e-06, - "objective/entropy": -137.64532470703125, - "objective/kl": 37.92731475830078, - "objective/non_score_reward": -1.8963658809661865, - "objective/rlhf_reward": -6.22621377680151, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 3.0997841358184814, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.603515625, - "step": 1236, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9978444576263428 - }, - { - "episode": 19808, - "epoch": 0.11868042324238176, - "loss/policy_avg": 0.013250820338726044, - "lr": 9.2094836400818e-06, - "objective/entropy": -204.8336944580078, - "objective/kl": 25.06024169921875, - "objective/non_score_reward": -1.2530121803283691, - "objective/rlhf_reward": -3.4962769387089576, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 3.133713960647583, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.55859375, - "step": 1237, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000356435775757 - }, - { - "episode": 19824, - "epoch": 0.1187762878815113, - "loss/policy_avg": 0.23657885193824768, - "lr": 9.208844580777096e-06, - "objective/entropy": -257.93719482421875, - "objective/kl": 34.67414855957031, - "objective/non_score_reward": -1.7337074279785156, - "objective/rlhf_reward": -5.201496616999308, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 40.29893112182617, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 1238, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9974265098571777 - }, - { - "episode": 19840, - "epoch": 0.11887215252064086, - "loss/policy_avg": 1.0746341943740845, - "lr": 9.208205521472393e-06, - "objective/entropy": -137.0782928466797, - "objective/kl": 38.25480270385742, - "objective/non_score_reward": -1.9127401113510132, - "objective/rlhf_reward": -6.200362186045989, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 4.038956165313721, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 1239, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000051736831665 - }, - { - "episode": 19856, - "epoch": 0.1189680171597704, - "loss/policy_avg": 0.2902667224407196, - "lr": 9.20756646216769e-06, - "objective/entropy": -241.73587036132812, - "objective/kl": 32.18947982788086, - "objective/non_score_reward": -1.6094739437103271, - "objective/rlhf_reward": -4.613067384037088, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 10.234606742858887, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.615234375, - "step": 1240, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9977645874023438 - }, - { - "episode": 19872, - "epoch": 0.11906388179889996, - "loss/policy_avg": 0.02851104736328125, - "lr": 9.206927402862987e-06, - "objective/entropy": -160.71896362304688, - "objective/kl": 47.23845291137695, - "objective/non_score_reward": -2.3619225025177, - "objective/rlhf_reward": -7.966737869198679, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 35.956363677978516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.626953125, - "step": 1241, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000300645828247 - }, - { - "episode": 19888, - "epoch": 0.1191597464380295, - "loss/policy_avg": 0.2741260528564453, - "lr": 9.206288343558284e-06, - "objective/entropy": -148.2718963623047, - "objective/kl": 38.57466125488281, - "objective/non_score_reward": -1.9287331104278564, - "objective/rlhf_reward": -6.264334539981231, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 3.452293872833252, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.59375, - "step": 1242, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0030832290649414 - }, - { - "episode": 19904, - "epoch": 0.11925561107715905, - "loss/policy_avg": 0.5994369387626648, - "lr": 9.205649284253579e-06, - "objective/entropy": -123.61450958251953, - "objective/kl": 36.576622009277344, - "objective/non_score_reward": -1.8288313150405884, - "objective/rlhf_reward": -5.973689547091155, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 12.081830024719238, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.626953125, - "step": 1243, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998105764389038 - }, - { - "episode": 19920, - "epoch": 0.1193514757162886, - "loss/policy_avg": -0.38412266969680786, - "lr": 9.205010224948876e-06, - "objective/entropy": -250.1025848388672, - "objective/kl": 33.524559020996094, - "objective/non_score_reward": -1.6762280464172363, - "objective/rlhf_reward": -2.3049123644828793, - "objective/scores": 1.1, - "policy/approxkl_avg": 16.83417510986328, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.736328125, - "step": 1244, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003070592880249 - }, - { - "episode": 19936, - "epoch": 0.11944734035541815, - "loss/policy_avg": 2.035850763320923, - "lr": 9.204371165644173e-06, - "objective/entropy": -190.210693359375, - "objective/kl": 26.431785583496094, - "objective/non_score_reward": -1.321589469909668, - "objective/rlhf_reward": -3.6244980148678883, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 25.93347930908203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.736328125, - "step": 1245, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.997817039489746 - }, - { - "episode": 19952, - "epoch": 0.1195432049945477, - "loss/policy_avg": 1.105665683746338, - "lr": 9.20373210633947e-06, - "objective/entropy": -201.83714294433594, - "objective/kl": 35.3839225769043, - "objective/non_score_reward": -1.7691962718963623, - "objective/rlhf_reward": -5.698182680693966, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 6.776236534118652, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.857421875, - "step": 1246, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000749349594116 - }, - { - "episode": 19968, - "epoch": 0.11963906963367725, - "loss/policy_avg": -0.04859113693237305, - "lr": 9.203093047034766e-06, - "objective/entropy": -258.0498046875, - "objective/kl": 28.967775344848633, - "objective/non_score_reward": -1.4483888149261475, - "objective/rlhf_reward": -3.393555378913879, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.895939826965332, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.681640625, - "step": 1247, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000812292098999 - }, - { - "episode": 19984, - "epoch": 0.11973493427280679, - "loss/policy_avg": 0.644065797328949, - "lr": 9.202453987730062e-06, - "objective/entropy": -258.9081726074219, - "objective/kl": 38.442054748535156, - "objective/non_score_reward": -1.922102689743042, - "objective/rlhf_reward": -6.264578659732905, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 4.295760154724121, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.67578125, - "step": 1248, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9992396831512451 - }, - { - "episode": 20000, - "epoch": 0.11983079891193635, - "loss/policy_avg": 0.9093930125236511, - "lr": 9.201814928425358e-06, - "objective/entropy": -194.09771728515625, - "objective/kl": 41.89799499511719, - "objective/non_score_reward": -2.0948996543884277, - "objective/rlhf_reward": -6.432187746243413, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 8.594277381896973, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53515625, - "step": 1249, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973087310791016 - }, - { - "episode": 20016, - "epoch": 0.11992666355106589, - "loss/policy_avg": 0.09421442449092865, - "lr": 9.201175869120655e-06, - "objective/entropy": -276.9185485839844, - "objective/kl": 33.968955993652344, - "objective/non_score_reward": -1.6984477043151855, - "objective/rlhf_reward": -5.23753175040777, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 2.0292129516601562, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.736328125, - "step": 1250, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9998764991760254 - }, - { - "episode": 20032, - "epoch": 0.12002252819019545, - "loss/policy_avg": 0.053171977400779724, - "lr": 9.200536809815952e-06, - "objective/entropy": -104.78028869628906, - "objective/kl": 29.34747314453125, - "objective/non_score_reward": -1.4673736095428467, - "objective/rlhf_reward": -4.388541939671397, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 16.4470157623291, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.76171875, - "step": 1251, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982876777648926 - }, - { - "episode": 20048, - "epoch": 0.12011839282932499, - "loss/policy_avg": 0.21370352804660797, - "lr": 9.199897750511249e-06, - "objective/entropy": -211.57241821289062, - "objective/kl": 34.70026779174805, - "objective/non_score_reward": -1.735013484954834, - "objective/rlhf_reward": -5.561452009765011, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.8732821941375732, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.634765625, - "step": 1252, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.002889394760132 - }, - { - "episode": 20064, - "epoch": 0.12021425746845454, - "loss/policy_avg": -0.06851379573345184, - "lr": 9.199258691206546e-06, - "objective/entropy": -247.22412109375, - "objective/kl": 24.82408905029297, - "objective/non_score_reward": -1.2412043809890747, - "objective/rlhf_reward": -4.964817762374878, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.531271457672119, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.59375, - "step": 1253, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0064773559570312 - }, - { - "episode": 20080, - "epoch": 0.12031012210758409, - "loss/policy_avg": 0.9840347766876221, - "lr": 9.198619631901841e-06, - "objective/entropy": -122.53502655029297, - "objective/kl": 40.514495849609375, - "objective/non_score_reward": -2.0257248878479004, - "objective/rlhf_reward": -6.652301590056762, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 12.03805923461914, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6640625, - "step": 1254, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9971917867660522 - }, - { - "episode": 20096, - "epoch": 0.12040598674671364, - "loss/policy_avg": 0.18231819570064545, - "lr": 9.197980572597138e-06, - "objective/entropy": -241.79513549804688, - "objective/kl": 38.14476776123047, - "objective/non_score_reward": -1.907238483428955, - "objective/rlhf_reward": -6.250351646033627, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 3.30513334274292, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 1255, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9998397827148438 - }, - { - "episode": 20112, - "epoch": 0.12050185138584318, - "loss/policy_avg": 0.23248505592346191, - "lr": 9.197341513292433e-06, - "objective/entropy": -235.57354736328125, - "objective/kl": 23.809890747070312, - "objective/non_score_reward": -1.1904945373535156, - "objective/rlhf_reward": -1.838259015918943, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 16.297555923461914, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62109375, - "step": 1256, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000349521636963 - }, - { - "episode": 20128, - "epoch": 0.12059771602497274, - "loss/policy_avg": 0.06544123589992523, - "lr": 9.19670245398773e-06, - "objective/entropy": -148.2562255859375, - "objective/kl": 42.84388732910156, - "objective/non_score_reward": -2.1421945095062256, - "objective/rlhf_reward": -7.144946057994929, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 11.275466918945312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.556640625, - "step": 1257, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001687526702881 - }, - { - "episode": 20144, - "epoch": 0.12069358066410228, - "loss/policy_avg": -0.2555674612522125, - "lr": 9.196063394683027e-06, - "objective/entropy": -262.81939697265625, - "objective/kl": 37.51679229736328, - "objective/non_score_reward": -1.8758397102355957, - "objective/rlhf_reward": -5.380652429834877, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 5.35495662689209, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.658203125, - "step": 1258, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0027287006378174 - }, - { - "episode": 20160, - "epoch": 0.12078944530323184, - "loss/policy_avg": -0.2871710956096649, - "lr": 9.195424335378324e-06, - "objective/entropy": -230.1177978515625, - "objective/kl": 37.040069580078125, - "objective/non_score_reward": -1.8520034551620483, - "objective/rlhf_reward": -7.408013701438904, - "objective/scores": 0.0, - "policy/approxkl_avg": 33.58317947387695, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.498046875, - "step": 1259, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0012998580932617 - }, - { - "episode": 20176, - "epoch": 0.12088530994236138, - "loss/policy_avg": 0.29908883571624756, - "lr": 9.19478527607362e-06, - "objective/entropy": -172.86453247070312, - "objective/kl": 45.35060501098633, - "objective/non_score_reward": -2.2675304412841797, - "objective/rlhf_reward": -7.408261661947357, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 5.0800933837890625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.69921875, - "step": 1260, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9999351501464844 - }, - { - "episode": 20192, - "epoch": 0.12098117458149094, - "loss/policy_avg": 0.25280916690826416, - "lr": 9.194146216768916e-06, - "objective/entropy": -241.52896118164062, - "objective/kl": 48.261566162109375, - "objective/non_score_reward": -2.4130783081054688, - "objective/rlhf_reward": -8.20171544990097, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 6.6377339363098145, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.705078125, - "step": 1261, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9999206066131592 - }, - { - "episode": 20208, - "epoch": 0.12107703922062048, - "loss/policy_avg": 0.3357711136341095, - "lr": 9.193507157464213e-06, - "objective/entropy": -187.2262725830078, - "objective/kl": 40.54385757446289, - "objective/non_score_reward": -2.0271928310394287, - "objective/rlhf_reward": -6.749521457885189, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 3.054933786392212, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.537109375, - "step": 1262, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999709129333496 - }, - { - "episode": 20224, - "epoch": 0.12117290385975003, - "loss/policy_avg": 0.036879949271678925, - "lr": 9.19286809815951e-06, - "objective/entropy": -289.2770690917969, - "objective/kl": 36.072391510009766, - "objective/non_score_reward": -1.803619384765625, - "objective/rlhf_reward": -5.790645618637171, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 1.7514324188232422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 1263, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9988840818405151 - }, - { - "episode": 20240, - "epoch": 0.12126876849887958, - "loss/policy_avg": 0.022309046238660812, - "lr": 9.192229038854807e-06, - "objective/entropy": -295.97265625, - "objective/kl": 34.17414093017578, - "objective/non_score_reward": -1.7087069749832153, - "objective/rlhf_reward": -5.509315047293825, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 2.405184507369995, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.634765625, - "step": 1264, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0027103424072266 - }, - { - "episode": 20256, - "epoch": 0.12136463313800913, - "loss/policy_avg": 0.11772053688764572, - "lr": 9.191589979550103e-06, - "objective/entropy": -216.94451904296875, - "objective/kl": 29.35517692565918, - "objective/non_score_reward": -1.4677588939666748, - "objective/rlhf_reward": -3.923624227719243, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 0.7640889883041382, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.63671875, - "step": 1265, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0017471313476562 - }, - { - "episode": 20272, - "epoch": 0.12146049777713867, - "loss/policy_avg": 0.45337143540382385, - "lr": 9.1909509202454e-06, - "objective/entropy": -248.708984375, - "objective/kl": 25.6322021484375, - "objective/non_score_reward": -1.2816100120544434, - "objective/rlhf_reward": -3.7848046331697995, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 0.588313102722168, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6875, - "step": 1266, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.001028299331665 - }, - { - "episode": 20288, - "epoch": 0.12155636241626823, - "loss/policy_avg": 0.4821030795574188, - "lr": 9.190311860940695e-06, - "objective/entropy": -246.07826232910156, - "objective/kl": 25.80655288696289, - "objective/non_score_reward": -1.2903276681900024, - "objective/rlhf_reward": -2.7613106727600094, - "objective/scores": 0.6, - "policy/approxkl_avg": 10.890335083007812, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62109375, - "step": 1267, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9990689754486084 - }, - { - "episode": 20304, - "epoch": 0.12165222705539779, - "loss/policy_avg": 0.28960275650024414, - "lr": 9.189672801635992e-06, - "objective/entropy": -265.9043273925781, - "objective/kl": 30.99881362915039, - "objective/non_score_reward": -1.5499407052993774, - "objective/rlhf_reward": -4.077056469694648, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 13.907394409179688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.716796875, - "step": 1268, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9994606971740723 - }, - { - "episode": 20320, - "epoch": 0.12174809169452733, - "loss/policy_avg": 0.03770780563354492, - "lr": 9.18903374233129e-06, - "objective/entropy": -197.77639770507812, - "objective/kl": 29.34738540649414, - "objective/non_score_reward": -1.4673693180084229, - "objective/rlhf_reward": -4.1361438194910685, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 0.8509318828582764, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.701171875, - "step": 1269, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0020077228546143 - }, - { - "episode": 20336, - "epoch": 0.12184395633365688, - "loss/policy_avg": 0.06795699894428253, - "lr": 9.188394683026586e-06, - "objective/entropy": -213.62989807128906, - "objective/kl": 31.280406951904297, - "objective/non_score_reward": -1.5640202760696411, - "objective/rlhf_reward": -4.930568132430238, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 8.444547653198242, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5703125, - "step": 1270, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9978091716766357 - }, - { - "episode": 20352, - "epoch": 0.12193982097278643, - "loss/policy_avg": 0.43714067339897156, - "lr": 9.187755623721883e-06, - "objective/entropy": -243.02096557617188, - "objective/kl": 39.39186477661133, - "objective/non_score_reward": -1.9695932865142822, - "objective/rlhf_reward": -6.39742028992927, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 44.565582275390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.56640625, - "step": 1271, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9978324174880981 - }, - { - "episode": 20368, - "epoch": 0.12203568561191598, - "loss/policy_avg": -0.05719127878546715, - "lr": 9.187116564417178e-06, - "objective/entropy": -192.6077880859375, - "objective/kl": 32.60759735107422, - "objective/non_score_reward": -1.6303796768188477, - "objective/rlhf_reward": -5.097686667640773, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 5.910696983337402, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.572265625, - "step": 1272, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.005018472671509 - }, - { - "episode": 20384, - "epoch": 0.12213155025104552, - "loss/policy_avg": 0.10466927289962769, - "lr": 9.186477505112475e-06, - "objective/entropy": -230.49244689941406, - "objective/kl": 27.570762634277344, - "objective/non_score_reward": -1.3785381317138672, - "objective/rlhf_reward": -3.9983805058323707, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 43.770347595214844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58984375, - "step": 1273, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983129501342773 - }, - { - "episode": 20400, - "epoch": 0.12222741489017508, - "loss/policy_avg": 0.32652002573013306, - "lr": 9.185838445807772e-06, - "objective/entropy": -237.1881866455078, - "objective/kl": 35.98992919921875, - "objective/non_score_reward": -1.7994965314865112, - "objective/rlhf_reward": -5.872473273307008, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 3.772566795349121, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7265625, - "step": 1274, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0020248889923096 - }, - { - "episode": 20416, - "epoch": 0.12232327952930462, - "loss/policy_avg": -0.041273415088653564, - "lr": 9.185199386503069e-06, - "objective/entropy": -249.03428649902344, - "objective/kl": 30.465728759765625, - "objective/non_score_reward": -1.5232863426208496, - "objective/rlhf_reward": -4.359812394777934, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 34.99227523803711, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.72265625, - "step": 1275, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000488758087158 - }, - { - "episode": 20432, - "epoch": 0.12241914416843418, - "loss/policy_avg": 0.07646825909614563, - "lr": 9.184560327198366e-06, - "objective/entropy": -274.52752685546875, - "objective/kl": 28.405258178710938, - "objective/non_score_reward": -1.4202628135681152, - "objective/rlhf_reward": -4.321801745627804, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 0.582542896270752, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6015625, - "step": 1276, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000093936920166 - }, - { - "episode": 20448, - "epoch": 0.12251500880756372, - "loss/policy_avg": 0.6634305119514465, - "lr": 9.183921267893663e-06, - "objective/entropy": -237.85279846191406, - "objective/kl": 37.730873107910156, - "objective/non_score_reward": -1.88654363155365, - "objective/rlhf_reward": -6.065221848900675, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 3.300227165222168, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 1277, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9993836879730225 - }, - { - "episode": 20464, - "epoch": 0.12261087344669327, - "loss/policy_avg": 0.32286834716796875, - "lr": 9.183282208588958e-06, - "objective/entropy": -170.94064331054688, - "objective/kl": 35.21946716308594, - "objective/non_score_reward": -1.7609732151031494, - "objective/rlhf_reward": -2.643892979621887, - "objective/scores": 1.1, - "policy/approxkl_avg": 21.075477600097656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69921875, - "step": 1278, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9982943534851074 - }, - { - "episode": 20480, - "epoch": 0.12270673808582282, - "loss/policy_avg": -0.0007353071123361588, - "lr": 9.182643149284255e-06, - "objective/entropy": -208.5531005859375, - "objective/kl": 36.26404571533203, - "objective/non_score_reward": -1.8132022619247437, - "objective/rlhf_reward": -5.874206879226071, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.7173817157745361, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.51953125, - "step": 1279, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9998679161071777 - }, - { - "episode": 20496, - "epoch": 0.12280260272495237, - "loss/policy_avg": 0.0016644150018692017, - "lr": 9.18200408997955e-06, - "objective/entropy": -276.2265930175781, - "objective/kl": 37.951438903808594, - "objective/non_score_reward": -1.8975720405578613, - "objective/rlhf_reward": -5.765459294590066, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 1.626516580581665, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6484375, - "step": 1280, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0007171630859375 - }, - { - "episode": 20512, - "epoch": 0.12289846736408191, - "loss/policy_avg": 0.9792773723602295, - "lr": 9.181365030674847e-06, - "objective/entropy": -181.45407104492188, - "objective/kl": 47.48221969604492, - "objective/non_score_reward": -2.3741111755371094, - "objective/rlhf_reward": -8.117842295256953, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 7.093747138977051, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6796875, - "step": 1281, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997685194015503 - }, - { - "episode": 20528, - "epoch": 0.12299433200321147, - "loss/policy_avg": 0.35386669635772705, - "lr": 9.180725971370144e-06, - "objective/entropy": -225.07867431640625, - "objective/kl": 32.99415588378906, - "objective/non_score_reward": -1.6497077941894531, - "objective/rlhf_reward": -5.174998839099971, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 36.31614685058594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7578125, - "step": 1282, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9996511936187744 - }, - { - "episode": 20544, - "epoch": 0.12309019664234101, - "loss/policy_avg": 0.9949113130569458, - "lr": 9.18008691206544e-06, - "objective/entropy": -144.422119140625, - "objective/kl": 41.22947311401367, - "objective/non_score_reward": -2.061473846435547, - "objective/rlhf_reward": -6.886644804213924, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 29.33792495727539, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.70703125, - "step": 1283, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9961674213409424 - }, - { - "episode": 20560, - "epoch": 0.12318606128147057, - "loss/policy_avg": 0.01584434136748314, - "lr": 9.179447852760737e-06, - "objective/entropy": -218.75259399414062, - "objective/kl": 29.35763168334961, - "objective/non_score_reward": -1.467881679534912, - "objective/rlhf_reward": -4.512276732657833, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 0.7336653470993042, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 1284, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0014827251434326 - }, - { - "episode": 20576, - "epoch": 0.12328192592060011, - "loss/policy_avg": 0.1053546816110611, - "lr": 9.178808793456033e-06, - "objective/entropy": -253.1468963623047, - "objective/kl": 34.82318878173828, - "objective/non_score_reward": -1.741159439086914, - "objective/rlhf_reward": -4.5646381139755245, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.2562737464904785, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.609375, - "step": 1285, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9984190464019775 - }, - { - "episode": 20592, - "epoch": 0.12337779055972967, - "loss/policy_avg": 1.1642229557037354, - "lr": 9.17816973415133e-06, - "objective/entropy": -255.337646484375, - "objective/kl": 33.243751525878906, - "objective/non_score_reward": -1.6621875762939453, - "objective/rlhf_reward": -6.648750364780426, - "objective/scores": 0.0, - "policy/approxkl_avg": 38.7473030090332, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.775390625, - "step": 1286, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.997887134552002 - }, - { - "episode": 20608, - "epoch": 0.12347365519885921, - "loss/policy_avg": -0.24089229106903076, - "lr": 9.177530674846626e-06, - "objective/entropy": -243.97262573242188, - "objective/kl": 25.183528900146484, - "objective/non_score_reward": -1.259176254272461, - "objective/rlhf_reward": -3.6128731562691607, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 9.022109031677246, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.642578125, - "step": 1287, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003718376159668 - }, - { - "episode": 20624, - "epoch": 0.12356951983798876, - "loss/policy_avg": 0.2587750554084778, - "lr": 9.176891615541923e-06, - "objective/entropy": -264.50152587890625, - "objective/kl": 47.71129608154297, - "objective/non_score_reward": -2.3855648040771484, - "objective/rlhf_reward": -7.142259335517883, - "objective/scores": 0.6, - "policy/approxkl_avg": 4.712902069091797, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.607421875, - "step": 1288, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9966511726379395 - }, - { - "episode": 20640, - "epoch": 0.1236653844771183, - "loss/policy_avg": 0.3948793411254883, - "lr": 9.17625255623722e-06, - "objective/entropy": -154.65003967285156, - "objective/kl": 40.509239196777344, - "objective/non_score_reward": -2.0254621505737305, - "objective/rlhf_reward": -6.7018486022949215, - "objective/scores": 0.35, - "policy/approxkl_avg": 3.50528621673584, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.533203125, - "step": 1289, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000455617904663 - }, - { - "episode": 20656, - "epoch": 0.12376124911624786, - "loss/policy_avg": 0.20847059786319733, - "lr": 9.175613496932517e-06, - "objective/entropy": -233.9412078857422, - "objective/kl": 41.79835510253906, - "objective/non_score_reward": -2.0899178981781006, - "objective/rlhf_reward": -7.0180361776644276, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 8.814239501953125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.607421875, - "step": 1290, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9992281198501587 - }, - { - "episode": 20672, - "epoch": 0.1238571137553774, - "loss/policy_avg": 0.34916895627975464, - "lr": 9.174974437627812e-06, - "objective/entropy": -225.4031982421875, - "objective/kl": 40.641937255859375, - "objective/non_score_reward": -2.0320968627929688, - "objective/rlhf_reward": -6.572128622737482, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 49.47692108154297, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6796875, - "step": 1291, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9976723194122314 - }, - { - "episode": 20688, - "epoch": 0.12395297839450696, - "loss/policy_avg": 2.849823474884033, - "lr": 9.174335378323109e-06, - "objective/entropy": -330.77435302734375, - "objective/kl": 23.790363311767578, - "objective/non_score_reward": -1.1895182132720947, - "objective/rlhf_reward": -3.307474712939605, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 46.90863800048828, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.603515625, - "step": 1292, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.00309157371521 - }, - { - "episode": 20704, - "epoch": 0.1240488430336365, - "loss/policy_avg": 0.20439790189266205, - "lr": 9.173696319018406e-06, - "objective/entropy": -275.361328125, - "objective/kl": 32.65497589111328, - "objective/non_score_reward": -1.6327489614486694, - "objective/rlhf_reward": -2.1309958457946774, - "objective/scores": 1.1, - "policy/approxkl_avg": 3.454024314880371, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.591796875, - "step": 1293, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9983692169189453 - }, - { - "episode": 20720, - "epoch": 0.12414470767276606, - "loss/policy_avg": 0.6102030277252197, - "lr": 9.173057259713703e-06, - "objective/entropy": -218.39520263671875, - "objective/kl": 27.34351348876953, - "objective/non_score_reward": -1.3671756982803345, - "objective/rlhf_reward": -3.9877501754120583, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 5.690610885620117, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.697265625, - "step": 1294, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999359130859375 - }, - { - "episode": 20736, - "epoch": 0.1242405723118956, - "loss/policy_avg": 0.12826263904571533, - "lr": 9.172418200409e-06, - "objective/entropy": -288.58819580078125, - "objective/kl": 36.94340133666992, - "objective/non_score_reward": -1.847170114517212, - "objective/rlhf_reward": -5.9648483588295855, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 9.69528579711914, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 1295, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9996252059936523 - }, - { - "episode": 20752, - "epoch": 0.12433643695102516, - "loss/policy_avg": -0.08016486465930939, - "lr": 9.171779141104295e-06, - "objective/entropy": -226.99656677246094, - "objective/kl": 41.39111328125, - "objective/non_score_reward": -2.0695557594299316, - "objective/rlhf_reward": -5.878223037719726, - "objective/scores": 0.6, - "policy/approxkl_avg": 1.6657519340515137, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.677734375, - "step": 1296, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000385046005249 - }, - { - "episode": 20768, - "epoch": 0.1244323015901547, - "loss/policy_avg": 0.44178086519241333, - "lr": 9.171140081799592e-06, - "objective/entropy": -236.19082641601562, - "objective/kl": 32.86880111694336, - "objective/non_score_reward": -1.6434402465820312, - "objective/rlhf_reward": -5.232125094442992, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 3.834670066833496, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.701171875, - "step": 1297, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9988516569137573 - }, - { - "episode": 20784, - "epoch": 0.12452816622928425, - "loss/policy_avg": 0.5984504818916321, - "lr": 9.170501022494889e-06, - "objective/entropy": -274.540771484375, - "objective/kl": 29.187076568603516, - "objective/non_score_reward": -1.4593539237976074, - "objective/rlhf_reward": -4.478166067336483, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.234905242919922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73828125, - "step": 1298, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.004179000854492 - }, - { - "episode": 20800, - "epoch": 0.1246240308684138, - "loss/policy_avg": -0.41880375146865845, - "lr": 9.169861963190185e-06, - "objective/entropy": -268.99920654296875, - "objective/kl": 33.32693862915039, - "objective/non_score_reward": -1.6663470268249512, - "objective/rlhf_reward": -4.265388345718383, - "objective/scores": 0.6, - "policy/approxkl_avg": 9.45730972290039, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.66796875, - "step": 1299, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0022175312042236 - }, - { - "episode": 20816, - "epoch": 0.12471989550754335, - "loss/policy_avg": 0.5496609807014465, - "lr": 9.169222903885482e-06, - "objective/entropy": -217.79193115234375, - "objective/kl": 33.751773834228516, - "objective/non_score_reward": -1.6875885725021362, - "objective/rlhf_reward": -5.408718755751281, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 2.2236084938049316, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53125, - "step": 1300, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001005172729492 - }, - { - "episode": 20832, - "epoch": 0.1248157601466729, - "loss/policy_avg": 0.017860662192106247, - "lr": 9.168583844580777e-06, - "objective/entropy": -255.07095336914062, - "objective/kl": 19.090106964111328, - "objective/non_score_reward": -0.9545053839683533, - "objective/rlhf_reward": -1.4180216252803803, - "objective/scores": 0.6, - "policy/approxkl_avg": 11.007080078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 1301, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.001258611679077 - }, - { - "episode": 20848, - "epoch": 0.12491162478580245, - "loss/policy_avg": 0.02041742019355297, - "lr": 9.167944785276074e-06, - "objective/entropy": -255.44552612304688, - "objective/kl": 40.95478057861328, - "objective/non_score_reward": -2.047739028930664, - "objective/rlhf_reward": -8.190956592559814, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.125746250152588, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5703125, - "step": 1302, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999814748764038 - }, - { - "episode": 20864, - "epoch": 0.125007489424932, - "loss/policy_avg": 0.26476216316223145, - "lr": 9.167305725971371e-06, - "objective/entropy": -234.08668518066406, - "objective/kl": 31.28912353515625, - "objective/non_score_reward": -1.5644559860229492, - "objective/rlhf_reward": -4.310412774758275, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 26.78909683227539, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66015625, - "step": 1303, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000152111053467 - }, - { - "episode": 20880, - "epoch": 0.12510335406406153, - "loss/policy_avg": 0.7206395864486694, - "lr": 9.166666666666666e-06, - "objective/entropy": -257.04144287109375, - "objective/kl": 32.617130279541016, - "objective/non_score_reward": -1.6308565139770508, - "objective/rlhf_reward": -4.967166869845942, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 1.7028008699417114, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.76171875, - "step": 1304, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000716209411621 - }, - { - "episode": 20896, - "epoch": 0.1251992187031911, - "loss/policy_avg": 0.9150592088699341, - "lr": 9.166027607361963e-06, - "objective/entropy": -226.0206298828125, - "objective/kl": 28.190610885620117, - "objective/non_score_reward": -1.409530520439148, - "objective/rlhf_reward": -2.714403127075407, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 9.585868835449219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.578125, - "step": 1305, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9991176128387451 - }, - { - "episode": 20912, - "epoch": 0.12529508334232065, - "loss/policy_avg": 0.6741877198219299, - "lr": 9.16538854805726e-06, - "objective/entropy": -244.3083953857422, - "objective/kl": 30.657371520996094, - "objective/non_score_reward": -1.5328686237335205, - "objective/rlhf_reward": -4.731474375724792, - "objective/scores": 0.35, - "policy/approxkl_avg": 5.305037498474121, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.642578125, - "step": 1306, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9968469142913818 - }, - { - "episode": 20928, - "epoch": 0.1253909479814502, - "loss/policy_avg": 0.09786906093358994, - "lr": 9.164749488752557e-06, - "objective/entropy": -290.24542236328125, - "objective/kl": 33.52435302734375, - "objective/non_score_reward": -1.6762176752090454, - "objective/rlhf_reward": -5.254272381873474, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 2.0794928073883057, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.671875, - "step": 1307, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9996331930160522 - }, - { - "episode": 20944, - "epoch": 0.12548681262057973, - "loss/policy_avg": -0.041130807250738144, - "lr": 9.164110429447854e-06, - "objective/entropy": -246.7947235107422, - "objective/kl": 30.54619598388672, - "objective/non_score_reward": -1.5273098945617676, - "objective/rlhf_reward": -4.658641557307586, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 59.35724639892578, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.64453125, - "step": 1308, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9982240200042725 - }, - { - "episode": 20960, - "epoch": 0.12558267725970929, - "loss/policy_avg": 0.05561627447605133, - "lr": 9.163471370143149e-06, - "objective/entropy": -219.60110473632812, - "objective/kl": 30.905031204223633, - "objective/non_score_reward": -1.5452516078948975, - "objective/rlhf_reward": -4.757174451549616, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 7.214193344116211, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58203125, - "step": 1309, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0004239082336426 - }, - { - "episode": 20976, - "epoch": 0.12567854189883884, - "loss/policy_avg": 0.42176759243011475, - "lr": 9.162832310838446e-06, - "objective/entropy": -211.8623504638672, - "objective/kl": 39.876808166503906, - "objective/non_score_reward": -1.9938405752182007, - "objective/rlhf_reward": -6.649849448233766, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 6.471524238586426, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.599609375, - "step": 1310, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0001091957092285 - }, - { - "episode": 20992, - "epoch": 0.1257744065379684, - "loss/policy_avg": 0.1641611009836197, - "lr": 9.162193251533743e-06, - "objective/entropy": -272.44757080078125, - "objective/kl": 35.32935333251953, - "objective/non_score_reward": -1.7664676904678345, - "objective/rlhf_reward": -5.706620895598812, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 16.05602264404297, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.732421875, - "step": 1311, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9982774257659912 - }, - { - "episode": 21008, - "epoch": 0.12587027117709793, - "loss/policy_avg": 0.10128459334373474, - "lr": 9.16155419222904e-06, - "objective/entropy": -218.8691864013672, - "objective/kl": 34.10152053833008, - "objective/non_score_reward": -1.7050760984420776, - "objective/rlhf_reward": -5.304532730373081, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 3.924506664276123, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.765625, - "step": 1312, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0003185272216797 - }, - { - "episode": 21024, - "epoch": 0.12596613581622748, - "loss/policy_avg": -0.07266978919506073, - "lr": 9.160915132924337e-06, - "objective/entropy": -176.869140625, - "objective/kl": 32.847267150878906, - "objective/non_score_reward": -1.6423635482788086, - "objective/rlhf_reward": -4.907594447553741, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 4.819439888000488, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.646484375, - "step": 1313, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0008370876312256 - }, - { - "episode": 21040, - "epoch": 0.12606200045535704, - "loss/policy_avg": 0.4377824664115906, - "lr": 9.160276073619634e-06, - "objective/entropy": -227.95974731445312, - "objective/kl": 32.87003707885742, - "objective/non_score_reward": -1.6435017585754395, - "objective/rlhf_reward": -5.2484944200813, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 101.58186340332031, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.771484375, - "step": 1314, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.001458168029785 - }, - { - "episode": 21056, - "epoch": 0.1261578650944866, - "loss/policy_avg": 0.029068514704704285, - "lr": 9.159637014314929e-06, - "objective/entropy": -196.00814819335938, - "objective/kl": 41.65742492675781, - "objective/non_score_reward": -2.082871437072754, - "objective/rlhf_reward": -6.208778919950996, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 3.3673386573791504, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.68359375, - "step": 1315, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9980723857879639 - }, - { - "episode": 21072, - "epoch": 0.12625372973361612, - "loss/policy_avg": 0.9132063984870911, - "lr": 9.158997955010226e-06, - "objective/entropy": -196.53677368164062, - "objective/kl": 34.47105026245117, - "objective/non_score_reward": -1.7235524654388428, - "objective/rlhf_reward": -6.8942097425460815, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.7787117958068848, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4892578125, - "step": 1316, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.003011703491211 - }, - { - "episode": 21088, - "epoch": 0.12634959437274568, - "loss/policy_avg": -0.14771617949008942, - "lr": 9.158358895705522e-06, - "objective/entropy": -225.04312133789062, - "objective/kl": 25.410263061523438, - "objective/non_score_reward": -1.2705130577087402, - "objective/rlhf_reward": -5.08205258846283, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.080203056335449, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6953125, - "step": 1317, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0016398429870605 - }, - { - "episode": 21104, - "epoch": 0.12644545901187523, - "loss/policy_avg": -0.03344951570034027, - "lr": 9.15771983640082e-06, - "objective/entropy": -264.9842529296875, - "objective/kl": 34.70489501953125, - "objective/non_score_reward": -1.7352447509765625, - "objective/rlhf_reward": -4.9935674173402145, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 10.510156631469727, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.62109375, - "step": 1318, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990448951721191 - }, - { - "episode": 21120, - "epoch": 0.1265413236510048, - "loss/policy_avg": 0.14975669980049133, - "lr": 9.157080777096116e-06, - "objective/entropy": -281.6861572265625, - "objective/kl": 38.669654846191406, - "objective/non_score_reward": -1.9334828853607178, - "objective/rlhf_reward": -6.000597850481668, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 2.9204273223876953, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.576171875, - "step": 1319, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9977920055389404 - }, - { - "episode": 21136, - "epoch": 0.12663718829013432, - "loss/policy_avg": -0.23473092913627625, - "lr": 9.156441717791411e-06, - "objective/entropy": -186.2064208984375, - "objective/kl": 33.10087203979492, - "objective/non_score_reward": -1.655043601989746, - "objective/rlhf_reward": -5.294661674529237, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 26.52355194091797, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.578125, - "step": 1320, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.002742290496826 - }, - { - "episode": 21152, - "epoch": 0.12673305292926387, - "loss/policy_avg": 0.8872619867324829, - "lr": 9.155802658486708e-06, - "objective/entropy": -265.64398193359375, - "objective/kl": 33.104408264160156, - "objective/non_score_reward": -1.6552205085754395, - "objective/rlhf_reward": -4.887548581759134, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 29.555377960205078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.69921875, - "step": 1321, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9992411136627197 - }, - { - "episode": 21168, - "epoch": 0.12682891756839343, - "loss/policy_avg": -0.05859680473804474, - "lr": 9.155163599182005e-06, - "objective/entropy": -179.13717651367188, - "objective/kl": 27.85260581970215, - "objective/non_score_reward": -1.3926303386688232, - "objective/rlhf_reward": -3.966401312414723, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 0.4925612211227417, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.611328125, - "step": 1322, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0010387897491455 - }, - { - "episode": 21184, - "epoch": 0.12692478220752298, - "loss/policy_avg": -0.12246014177799225, - "lr": 9.1545245398773e-06, - "objective/entropy": -237.7357177734375, - "objective/kl": 34.5874137878418, - "objective/non_score_reward": -1.7293705940246582, - "objective/rlhf_reward": -5.466884355159149, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 4.104196548461914, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6328125, - "step": 1323, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0004804134368896 - }, - { - "episode": 21200, - "epoch": 0.1270206468466525, - "loss/policy_avg": 0.22411450743675232, - "lr": 9.153885480572597e-06, - "objective/entropy": -273.3883361816406, - "objective/kl": 42.53919219970703, - "objective/non_score_reward": -2.1269593238830566, - "objective/rlhf_reward": -6.845978026807892, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 4.946126937866211, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.642578125, - "step": 1324, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9990512132644653 - }, - { - "episode": 21216, - "epoch": 0.12711651148578207, - "loss/policy_avg": 0.06539204716682434, - "lr": 9.153246421267894e-06, - "objective/entropy": -167.3392333984375, - "objective/kl": 33.024253845214844, - "objective/non_score_reward": -1.6512128114700317, - "objective/rlhf_reward": -5.048591821399286, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 4.672647476196289, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.40234375, - "step": 1325, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.005505084991455 - }, - { - "episode": 21232, - "epoch": 0.12721237612491162, - "loss/policy_avg": 0.17451216280460358, - "lr": 9.152607361963191e-06, - "objective/entropy": -168.8487548828125, - "objective/kl": 26.45303726196289, - "objective/non_score_reward": -1.3226518630981445, - "objective/rlhf_reward": -3.965094718962831, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.9421508312225342, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.51953125, - "step": 1326, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0126333236694336 - }, - { - "episode": 21248, - "epoch": 0.12730824076404118, - "loss/policy_avg": -0.005517004989087582, - "lr": 9.151968302658488e-06, - "objective/entropy": -176.31719970703125, - "objective/kl": 18.665822982788086, - "objective/non_score_reward": -0.9332911968231201, - "objective/rlhf_reward": -2.3093326284485736, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 1.5738120079040527, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.525390625, - "step": 1327, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9999518394470215 - }, - { - "episode": 21264, - "epoch": 0.12740410540317074, - "loss/policy_avg": 0.1618424952030182, - "lr": 9.151329243353783e-06, - "objective/entropy": -217.6151123046875, - "objective/kl": 24.312286376953125, - "objective/non_score_reward": -1.2156143188476562, - "objective/rlhf_reward": -2.9150461656617477, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 30.008869171142578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.74609375, - "step": 1328, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000408172607422 - }, - { - "episode": 21280, - "epoch": 0.12749997004230026, - "loss/policy_avg": 0.15089087188243866, - "lr": 9.15069018404908e-06, - "objective/entropy": -231.04893493652344, - "objective/kl": 34.12983322143555, - "objective/non_score_reward": -1.7064917087554932, - "objective/rlhf_reward": -5.48433118155542, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 37.14998245239258, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.78125, - "step": 1329, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9984626770019531 - }, - { - "episode": 21296, - "epoch": 0.12759583468142982, - "loss/policy_avg": 0.13896551728248596, - "lr": 9.150051124744377e-06, - "objective/entropy": -192.38351440429688, - "objective/kl": 22.335050582885742, - "objective/non_score_reward": -1.1167525053024292, - "objective/rlhf_reward": -3.0164120002702326, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 3.72003173828125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6875, - "step": 1330, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9977521896362305 - }, - { - "episode": 21312, - "epoch": 0.12769169932055938, - "loss/policy_avg": 0.038389697670936584, - "lr": 9.149412065439674e-06, - "objective/entropy": -206.938232421875, - "objective/kl": 23.90731430053711, - "objective/non_score_reward": -1.1953657865524292, - "objective/rlhf_reward": -3.265691363605198, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 1.9220904111862183, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.564453125, - "step": 1331, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0020275115966797 - }, - { - "episode": 21328, - "epoch": 0.12778756395968893, - "loss/policy_avg": 0.22985966503620148, - "lr": 9.14877300613497e-06, - "objective/entropy": -236.96868896484375, - "objective/kl": 20.446491241455078, - "objective/non_score_reward": -1.022324562072754, - "objective/rlhf_reward": -1.6892985463142396, - "objective/scores": 0.6, - "policy/approxkl_avg": 16.64617347717285, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.96875, - "step": 1332, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.000746726989746 - }, - { - "episode": 21344, - "epoch": 0.12788342859881846, - "loss/policy_avg": 0.3077865540981293, - "lr": 9.148133946830266e-06, - "objective/entropy": -234.60574340820312, - "objective/kl": 38.31067657470703, - "objective/non_score_reward": -1.9155337810516357, - "objective/rlhf_reward": -4.7384163483392925, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 37.95224380493164, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73046875, - "step": 1333, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998931884765625 - }, - { - "episode": 21360, - "epoch": 0.12797929323794802, - "loss/policy_avg": 0.13958078622817993, - "lr": 9.147494887525563e-06, - "objective/entropy": -273.56170654296875, - "objective/kl": 31.292470932006836, - "objective/non_score_reward": -1.5646235942840576, - "objective/rlhf_reward": -1.8584942579269406, - "objective/scores": 1.1, - "policy/approxkl_avg": 1.8678287267684937, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.619140625, - "step": 1334, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.997137188911438 - }, - { - "episode": 21376, - "epoch": 0.12807515787707757, - "loss/policy_avg": 0.42439746856689453, - "lr": 9.14685582822086e-06, - "objective/entropy": -267.5999755859375, - "objective/kl": 33.0029296875, - "objective/non_score_reward": -1.650146484375, - "objective/rlhf_reward": -5.2750730848609635, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.55873441696167, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.66015625, - "step": 1335, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9997644424438477 - }, - { - "episode": 21392, - "epoch": 0.12817102251620713, - "loss/policy_avg": 0.002216493710875511, - "lr": 9.146216768916156e-06, - "objective/entropy": -226.58786010742188, - "objective/kl": 22.239288330078125, - "objective/non_score_reward": -1.111964464187622, - "objective/rlhf_reward": -2.500446598009999, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 11.510183334350586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6953125, - "step": 1336, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985392093658447 - }, - { - "episode": 21408, - "epoch": 0.12826688715533666, - "loss/policy_avg": 0.2896654009819031, - "lr": 9.145577709611453e-06, - "objective/entropy": -275.9249267578125, - "objective/kl": 33.59234619140625, - "objective/non_score_reward": -1.679617166519165, - "objective/rlhf_reward": -4.893640096458506, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 2.7253494262695312, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.728515625, - "step": 1337, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9982388019561768 - }, - { - "episode": 21424, - "epoch": 0.1283627517944662, - "loss/policy_avg": 0.4936927556991577, - "lr": 9.14493865030675e-06, - "objective/entropy": -174.52462768554688, - "objective/kl": 30.66004180908203, - "objective/non_score_reward": -1.5330020189285278, - "objective/rlhf_reward": -4.7320081949234005, - "objective/scores": 0.35, - "policy/approxkl_avg": 3.5361199378967285, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.52734375, - "step": 1338, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999757170677185 - }, - { - "episode": 21440, - "epoch": 0.12845861643359577, - "loss/policy_avg": 0.4622963070869446, - "lr": 9.144299591002045e-06, - "objective/entropy": -278.365966796875, - "objective/kl": 37.1561393737793, - "objective/non_score_reward": -1.8578070402145386, - "objective/rlhf_reward": -5.606399412426065, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 12.119524002075195, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6640625, - "step": 1339, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9970488548278809 - }, - { - "episode": 21456, - "epoch": 0.12855448107272532, - "loss/policy_avg": 0.1313559114933014, - "lr": 9.143660531697342e-06, - "objective/entropy": -254.86607360839844, - "objective/kl": 35.33464813232422, - "objective/non_score_reward": -1.7667322158813477, - "objective/rlhf_reward": -4.143209849239561, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 22.45963478088379, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6875, - "step": 1340, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9981313943862915 - }, - { - "episode": 21472, - "epoch": 0.12865034571185485, - "loss/policy_avg": 0.38973551988601685, - "lr": 9.143021472392639e-06, - "objective/entropy": -272.54193115234375, - "objective/kl": 27.13404655456543, - "objective/non_score_reward": -1.3567023277282715, - "objective/rlhf_reward": -3.693476096789042, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.679624080657959, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.658203125, - "step": 1341, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9994499683380127 - }, - { - "episode": 21488, - "epoch": 0.1287462103509844, - "loss/policy_avg": 0.11905691772699356, - "lr": 9.142382413087936e-06, - "objective/entropy": -210.89501953125, - "objective/kl": 28.64351463317871, - "objective/non_score_reward": -1.432175636291504, - "objective/rlhf_reward": -2.804983530880186, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 9.039191246032715, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.603515625, - "step": 1342, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9983291625976562 - }, - { - "episode": 21504, - "epoch": 0.12884207499011396, - "loss/policy_avg": 0.14720244705677032, - "lr": 9.141743353783233e-06, - "objective/entropy": -224.0950164794922, - "objective/kl": 25.995969772338867, - "objective/non_score_reward": -1.2997984886169434, - "objective/rlhf_reward": -2.2754748209726543, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 2.1929235458374023, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.771484375, - "step": 1343, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9995906352996826 - }, - { - "episode": 21520, - "epoch": 0.12893793962924352, - "loss/policy_avg": 0.17890335619449615, - "lr": 9.14110429447853e-06, - "objective/entropy": -250.506103515625, - "objective/kl": 29.027278900146484, - "objective/non_score_reward": -1.4513640403747559, - "objective/rlhf_reward": -1.405456072092056, - "objective/scores": 1.1, - "policy/approxkl_avg": 12.065425872802734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.80078125, - "step": 1344, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9962573051452637 - }, - { - "episode": 21536, - "epoch": 0.12903380426837305, - "loss/policy_avg": 0.08815973997116089, - "lr": 9.140465235173825e-06, - "objective/entropy": -284.2688293457031, - "objective/kl": 30.836158752441406, - "objective/non_score_reward": -1.5418078899383545, - "objective/rlhf_reward": -4.5053721718197925, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 6.909310340881348, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.783203125, - "step": 1345, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9966943264007568 - }, - { - "episode": 21552, - "epoch": 0.1291296689075026, - "loss/policy_avg": 0.024522747844457626, - "lr": 9.13982617586912e-06, - "objective/entropy": -161.51828002929688, - "objective/kl": 30.234634399414062, - "objective/non_score_reward": -1.5117316246032715, - "objective/rlhf_reward": -4.721413645774049, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.0650488138198853, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.587890625, - "step": 1346, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 2.0002496242523193 - }, - { - "episode": 21568, - "epoch": 0.12922553354663216, - "loss/policy_avg": 0.5536386966705322, - "lr": 9.139187116564417e-06, - "objective/entropy": -235.6590118408203, - "objective/kl": 32.029144287109375, - "objective/non_score_reward": -1.6014573574066162, - "objective/rlhf_reward": -4.283122958914314, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 21.44164276123047, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.619140625, - "step": 1347, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.997905969619751 - }, - { - "episode": 21584, - "epoch": 0.12932139818576172, - "loss/policy_avg": -0.14616435766220093, - "lr": 9.138548057259714e-06, - "objective/entropy": -236.11582946777344, - "objective/kl": 26.366846084594727, - "objective/non_score_reward": -1.3183423280715942, - "objective/rlhf_reward": -3.947856519251985, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 8.052356719970703, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.68359375, - "step": 1348, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.003175735473633 - }, - { - "episode": 21600, - "epoch": 0.12941726282489124, - "loss/policy_avg": 0.22792214155197144, - "lr": 9.13790899795501e-06, - "objective/entropy": -209.01907348632812, - "objective/kl": 33.44483184814453, - "objective/non_score_reward": -1.6722415685653687, - "objective/rlhf_reward": -5.31036422499786, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 15.402750015258789, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.81640625, - "step": 1349, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000248432159424 - }, - { - "episode": 21616, - "epoch": 0.1295131274640208, - "loss/policy_avg": 0.20839962363243103, - "lr": 9.137269938650308e-06, - "objective/entropy": -292.2127990722656, - "objective/kl": 29.052120208740234, - "objective/non_score_reward": -1.452605962753296, - "objective/rlhf_reward": -1.4104240894317623, - "objective/scores": 1.1, - "policy/approxkl_avg": 76.34044647216797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.580078125, - "step": 1350, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.997493028640747 - }, - { - "episode": 21632, - "epoch": 0.12960899210315036, - "loss/policy_avg": -0.08632227778434753, - "lr": 9.136630879345604e-06, - "objective/entropy": -173.5177764892578, - "objective/kl": 29.301441192626953, - "objective/non_score_reward": -1.4650721549987793, - "objective/rlhf_reward": -4.481686213103634, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 2.284684658050537, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.60546875, - "step": 1351, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.002286434173584 - }, - { - "episode": 21648, - "epoch": 0.1297048567422799, - "loss/policy_avg": 0.10895340144634247, - "lr": 9.1359918200409e-06, - "objective/entropy": -279.0048828125, - "objective/kl": 34.87440872192383, - "objective/non_score_reward": -1.7437204122543335, - "objective/rlhf_reward": -2.5748816490173336, - "objective/scores": 1.1, - "policy/approxkl_avg": 13.06716537475586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.583984375, - "step": 1352, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.997962236404419 - }, - { - "episode": 21664, - "epoch": 0.12980072138140944, - "loss/policy_avg": 0.2365398406982422, - "lr": 9.135352760736197e-06, - "objective/entropy": -250.8545379638672, - "objective/kl": 30.62120819091797, - "objective/non_score_reward": -1.5310604572296143, - "objective/rlhf_reward": -3.2005228146326274, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 25.900188446044922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.63671875, - "step": 1353, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.997548222541809 - }, - { - "episode": 21680, - "epoch": 0.129896586020539, - "loss/policy_avg": -0.0653112605214119, - "lr": 9.134713701431493e-06, - "objective/entropy": -263.48004150390625, - "objective/kl": 24.03810691833496, - "objective/non_score_reward": -1.2019054889678955, - "objective/rlhf_reward": -2.407621836662292, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.6270973682403564, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.62890625, - "step": 1354, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.00001859664917 - }, - { - "episode": 21696, - "epoch": 0.12999245065966855, - "loss/policy_avg": 0.24467766284942627, - "lr": 9.13407464212679e-06, - "objective/entropy": -237.13613891601562, - "objective/kl": 24.655006408691406, - "objective/non_score_reward": -1.2327501773834229, - "objective/rlhf_reward": -0.5310010671615597, - "objective/scores": 1.1, - "policy/approxkl_avg": 23.784618377685547, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 1355, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.004101276397705 - }, - { - "episode": 21712, - "epoch": 0.1300883152987981, - "loss/policy_avg": 0.0691906288266182, - "lr": 9.133435582822087e-06, - "objective/entropy": -244.44912719726562, - "objective/kl": 30.4073486328125, - "objective/non_score_reward": -1.5203675031661987, - "objective/rlhf_reward": -1.6814697742462155, - "objective/scores": 1.1, - "policy/approxkl_avg": 0.9179539680480957, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6640625, - "step": 1356, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.001126289367676 - }, - { - "episode": 21728, - "epoch": 0.13018417993792764, - "loss/policy_avg": 0.21818453073501587, - "lr": 9.132796523517384e-06, - "objective/entropy": -227.47018432617188, - "objective/kl": 28.718124389648438, - "objective/non_score_reward": -1.435906171798706, - "objective/rlhf_reward": -3.918795819553446, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 0.8305081129074097, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.697265625, - "step": 1357, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9998927116394043 - }, - { - "episode": 21744, - "epoch": 0.1302800445770572, - "loss/policy_avg": 0.5288101434707642, - "lr": 9.13215746421268e-06, - "objective/entropy": -254.03286743164062, - "objective/kl": 40.13897705078125, - "objective/non_score_reward": -2.006948947906494, - "objective/rlhf_reward": -6.471536367145136, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 2.1408591270446777, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5859375, - "step": 1358, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989702701568604 - }, - { - "episode": 21760, - "epoch": 0.13037590921618675, - "loss/policy_avg": -0.015707701444625854, - "lr": 9.131518404907976e-06, - "objective/entropy": -235.0547637939453, - "objective/kl": 34.96942901611328, - "objective/non_score_reward": -1.7484712600708008, - "objective/rlhf_reward": -5.57005317946252, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 39.12907409667969, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.666015625, - "step": 1359, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9975736141204834 - }, - { - "episode": 21776, - "epoch": 0.1304717738553163, - "loss/policy_avg": -0.031348615884780884, - "lr": 9.130879345603273e-06, - "objective/entropy": -216.28042602539062, - "objective/kl": 31.17209243774414, - "objective/non_score_reward": -1.5586044788360596, - "objective/rlhf_reward": -4.111712040678535, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 0.8966926336288452, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53125, - "step": 1360, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001847267150879 - }, - { - "episode": 21792, - "epoch": 0.13056763849444583, - "loss/policy_avg": 0.30280882120132446, - "lr": 9.13024028629857e-06, - "objective/entropy": -213.22189331054688, - "objective/kl": 31.471433639526367, - "objective/non_score_reward": -1.573571801185608, - "objective/rlhf_reward": -4.632427459180938, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 3.3364553451538086, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.595703125, - "step": 1361, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9998619556427002 - }, - { - "episode": 21808, - "epoch": 0.1306635031335754, - "loss/policy_avg": 0.20570358633995056, - "lr": 9.129601226993867e-06, - "objective/entropy": -179.83119201660156, - "objective/kl": 25.478784561157227, - "objective/non_score_reward": -1.2739393711090088, - "objective/rlhf_reward": -3.6957572460174557, - "objective/scores": 0.35, - "policy/approxkl_avg": 90.71241760253906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.55078125, - "step": 1362, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997685194015503 - }, - { - "episode": 21824, - "epoch": 0.13075936777270494, - "loss/policy_avg": 0.6416128873825073, - "lr": 9.128962167689162e-06, - "objective/entropy": -272.2728271484375, - "objective/kl": 32.88115692138672, - "objective/non_score_reward": -1.6440578699111938, - "objective/rlhf_reward": -5.019972293582514, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 12.951141357421875, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.572265625, - "step": 1363, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9987308979034424 - }, - { - "episode": 21840, - "epoch": 0.1308552324118345, - "loss/policy_avg": -0.41209667921066284, - "lr": 9.128323108384459e-06, - "objective/entropy": -244.28286743164062, - "objective/kl": 30.94601058959961, - "objective/non_score_reward": -1.5473005771636963, - "objective/rlhf_reward": -3.2654829963457317, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 22.515792846679688, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.501953125, - "step": 1364, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0037689208984375 - }, - { - "episode": 21856, - "epoch": 0.13095109705096403, - "loss/policy_avg": -0.01563386619091034, - "lr": 9.127684049079756e-06, - "objective/entropy": -255.2119140625, - "objective/kl": 27.473278045654297, - "objective/non_score_reward": -1.3736639022827148, - "objective/rlhf_reward": -3.8905355072656445, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.797173261642456, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.541015625, - "step": 1365, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9988737106323242 - }, - { - "episode": 21872, - "epoch": 0.13104696169009358, - "loss/policy_avg": 0.23629775643348694, - "lr": 9.127044989775053e-06, - "objective/entropy": -272.95880126953125, - "objective/kl": 34.79148483276367, - "objective/non_score_reward": -1.7395741939544678, - "objective/rlhf_reward": -6.958296895027161, - "objective/scores": 0.0, - "policy/approxkl_avg": 3.5488548278808594, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.57421875, - "step": 1366, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 2.0178794860839844 - }, - { - "episode": 21888, - "epoch": 0.13114282632922314, - "loss/policy_avg": -0.18055079877376556, - "lr": 9.126405930470348e-06, - "objective/entropy": -238.3826904296875, - "objective/kl": 31.74860191345215, - "objective/non_score_reward": -1.5874300003051758, - "objective/rlhf_reward": -4.8991220994905085, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 6.933784484863281, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.638671875, - "step": 1367, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982986450195312 - }, - { - "episode": 21904, - "epoch": 0.1312386909683527, - "loss/policy_avg": 0.5160447359085083, - "lr": 9.125766871165645e-06, - "objective/entropy": -277.48004150390625, - "objective/kl": 33.97571563720703, - "objective/non_score_reward": -1.6987860202789307, - "objective/rlhf_reward": -4.6724373719849925, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 15.784793853759766, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6015625, - "step": 1368, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9972174167633057 - }, - { - "episode": 21920, - "epoch": 0.13133455560748222, - "loss/policy_avg": 0.09226138889789581, - "lr": 9.125127811860942e-06, - "objective/entropy": -288.6790466308594, - "objective/kl": 26.108116149902344, - "objective/non_score_reward": -1.305405855178833, - "objective/rlhf_reward": -0.8216233015060421, - "objective/scores": 1.1, - "policy/approxkl_avg": 34.192481994628906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.55078125, - "step": 1369, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981337785720825 - }, - { - "episode": 21936, - "epoch": 0.13143042024661178, - "loss/policy_avg": -0.005547836422920227, - "lr": 9.124488752556238e-06, - "objective/entropy": -261.7656555175781, - "objective/kl": 39.76494216918945, - "objective/non_score_reward": -1.988247036933899, - "objective/rlhf_reward": -6.291128640592682, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.2966415882110596, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.63671875, - "step": 1370, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999891996383667 - }, - { - "episode": 21952, - "epoch": 0.13152628488574133, - "loss/policy_avg": 0.1910111904144287, - "lr": 9.123849693251534e-06, - "objective/entropy": -71.11714935302734, - "objective/kl": 37.65461730957031, - "objective/non_score_reward": -1.8827309608459473, - "objective/rlhf_reward": -4.607204352260801, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 7.529366493225098, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.763671875, - "step": 1371, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.99833345413208 - }, - { - "episode": 21968, - "epoch": 0.1316221495248709, - "loss/policy_avg": 1.7684245109558105, - "lr": 9.12321063394683e-06, - "objective/entropy": -231.41371154785156, - "objective/kl": 34.49894714355469, - "objective/non_score_reward": -1.7249473333358765, - "objective/rlhf_reward": -5.237929766595946, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 61.1027717590332, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5546875, - "step": 1372, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997320175170898 - }, - { - "episode": 21984, - "epoch": 0.13171801416400042, - "loss/policy_avg": -0.03314230218529701, - "lr": 9.122571574642127e-06, - "objective/entropy": -290.369384765625, - "objective/kl": 32.8198127746582, - "objective/non_score_reward": -1.6409904956817627, - "objective/rlhf_reward": -5.2384496069251725, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 13.562549591064453, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6171875, - "step": 1373, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0022025108337402 - }, - { - "episode": 22000, - "epoch": 0.13181387880312997, - "loss/policy_avg": 0.11191444098949432, - "lr": 9.121932515337424e-06, - "objective/entropy": -228.10528564453125, - "objective/kl": 38.98127746582031, - "objective/non_score_reward": -1.9490638971328735, - "objective/rlhf_reward": -6.470742616683168, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 2.0331411361694336, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.564453125, - "step": 1374, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986833333969116 - }, - { - "episode": 22016, - "epoch": 0.13190974344225953, - "loss/policy_avg": -0.26164868474006653, - "lr": 9.121293456032721e-06, - "objective/entropy": -226.21148681640625, - "objective/kl": 34.36164855957031, - "objective/non_score_reward": -1.718082308769226, - "objective/rlhf_reward": -5.4723293542861935, - "objective/scores": 0.35, - "policy/approxkl_avg": 3.828913450241089, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.673828125, - "step": 1375, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985249042510986 - }, - { - "episode": 22032, - "epoch": 0.1320056080813891, - "loss/policy_avg": 0.21197950839996338, - "lr": 9.120654396728016e-06, - "objective/entropy": -255.317138671875, - "objective/kl": 44.30939865112305, - "objective/non_score_reward": -2.2154700756073, - "objective/rlhf_reward": -7.483277657119137, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 5.002331733703613, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.619140625, - "step": 1376, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9972087144851685 - }, - { - "episode": 22048, - "epoch": 0.13210147272051861, - "loss/policy_avg": 0.2008858621120453, - "lr": 9.120015337423313e-06, - "objective/entropy": -194.98388671875, - "objective/kl": 29.178813934326172, - "objective/non_score_reward": -1.4589406251907349, - "objective/rlhf_reward": -4.510249648123903, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.7124892473220825, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5546875, - "step": 1377, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.999535322189331 - }, - { - "episode": 22064, - "epoch": 0.13219733735964817, - "loss/policy_avg": 0.2521211802959442, - "lr": 9.11937627811861e-06, - "objective/entropy": -275.01416015625, - "objective/kl": 40.16548538208008, - "objective/non_score_reward": -2.008274555206299, - "objective/rlhf_reward": -6.707584772139711, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 8.357677459716797, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.640625, - "step": 1378, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9994690418243408 - }, - { - "episode": 22080, - "epoch": 0.13229320199877773, - "loss/policy_avg": 0.20335987210273743, - "lr": 9.118737218813907e-06, - "objective/entropy": -219.05987548828125, - "objective/kl": 28.999086380004883, - "objective/non_score_reward": -1.4499542713165283, - "objective/rlhf_reward": -4.421214976397854, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.1380681991577148, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6015625, - "step": 1379, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001814365386963 - }, - { - "episode": 22096, - "epoch": 0.13238906663790728, - "loss/policy_avg": -0.17124547064304352, - "lr": 9.118098159509204e-06, - "objective/entropy": -102.70747375488281, - "objective/kl": 36.22713851928711, - "objective/non_score_reward": -1.8113569021224976, - "objective/rlhf_reward": -5.866825440017086, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 0.37671273946762085, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.560546875, - "step": 1380, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001765251159668 - }, - { - "episode": 22112, - "epoch": 0.1324849312770368, - "loss/policy_avg": 0.6594262719154358, - "lr": 9.1174591002045e-06, - "objective/entropy": -239.76181030273438, - "objective/kl": 38.55724334716797, - "objective/non_score_reward": -1.9278624057769775, - "objective/rlhf_reward": -6.332847335425717, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 63.58997344970703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.748046875, - "step": 1381, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9986870288848877 - }, - { - "episode": 22128, - "epoch": 0.13258079591616637, - "loss/policy_avg": 0.26639020442962646, - "lr": 9.116820040899796e-06, - "objective/entropy": -228.43006896972656, - "objective/kl": 41.377357482910156, - "objective/non_score_reward": -2.0688676834106445, - "objective/rlhf_reward": -5.875471210479736, - "objective/scores": 0.6, - "policy/approxkl_avg": 24.236587524414062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.580078125, - "step": 1382, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.997843623161316 - }, - { - "episode": 22144, - "epoch": 0.13267666055529592, - "loss/policy_avg": 0.22560517489910126, - "lr": 9.116180981595093e-06, - "objective/entropy": -275.1982421875, - "objective/kl": 33.85704040527344, - "objective/non_score_reward": -1.6928520202636719, - "objective/rlhf_reward": -4.371408081054687, - "objective/scores": 0.6, - "policy/approxkl_avg": 50.331180572509766, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.595703125, - "step": 1383, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996649980545044 - }, - { - "episode": 22160, - "epoch": 0.13277252519442548, - "loss/policy_avg": 0.3458302617073059, - "lr": 9.11554192229039e-06, - "objective/entropy": -284.34478759765625, - "objective/kl": 41.527374267578125, - "objective/non_score_reward": -2.076368808746338, - "objective/rlhf_reward": -6.6436157278424375, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 3.7493739128112793, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.615234375, - "step": 1384, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000601291656494 - }, - { - "episode": 22176, - "epoch": 0.13286838983355503, - "loss/policy_avg": 0.04170902818441391, - "lr": 9.114902862985686e-06, - "objective/entropy": -232.59671020507812, - "objective/kl": 23.403087615966797, - "objective/non_score_reward": -1.1701544523239136, - "objective/rlhf_reward": -3.2806179285049435, - "objective/scores": 0.35, - "policy/approxkl_avg": 11.652734756469727, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.642578125, - "step": 1385, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0049519538879395 - }, - { - "episode": 22192, - "epoch": 0.13296425447268456, - "loss/policy_avg": 0.7652486562728882, - "lr": 9.114263803680983e-06, - "objective/entropy": -264.1070251464844, - "objective/kl": 38.55071258544922, - "objective/non_score_reward": -1.9275355339050293, - "objective/rlhf_reward": -5.885313267978739, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 16.973102569580078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.732421875, - "step": 1386, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9949281215667725 - }, - { - "episode": 22208, - "epoch": 0.13306011911181412, - "loss/policy_avg": 0.26107269525527954, - "lr": 9.113624744376279e-06, - "objective/entropy": -274.548828125, - "objective/kl": 29.125957489013672, - "objective/non_score_reward": -1.4562978744506836, - "objective/rlhf_reward": -4.37459359607254, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 11.973018646240234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.669921875, - "step": 1387, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.997715711593628 - }, - { - "episode": 22224, - "epoch": 0.13315598375094367, - "loss/policy_avg": 0.1225675493478775, - "lr": 9.112985685071575e-06, - "objective/entropy": -218.16091918945312, - "objective/kl": 35.720855712890625, - "objective/non_score_reward": -1.7860426902770996, - "objective/rlhf_reward": -5.196759770588811, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 32.86650085449219, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8046875, - "step": 1388, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0017457008361816 - }, - { - "episode": 22240, - "epoch": 0.13325184839007323, - "loss/policy_avg": 0.343703955411911, - "lr": 9.112346625766872e-06, - "objective/entropy": -225.82559204101562, - "objective/kl": 24.557886123657227, - "objective/non_score_reward": -1.2278943061828613, - "objective/rlhf_reward": -3.5523273584589194, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.353696584701538, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.671875, - "step": 1389, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 2.00032901763916 - }, - { - "episode": 22256, - "epoch": 0.13334771302920276, - "loss/policy_avg": 0.22060903906822205, - "lr": 9.111707566462168e-06, - "objective/entropy": -259.0480651855469, - "objective/kl": 25.11700439453125, - "objective/non_score_reward": -1.2558501958847046, - "objective/rlhf_reward": -3.467141597476557, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 1.6127742528915405, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.638671875, - "step": 1390, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9998914003372192 - }, - { - "episode": 22272, - "epoch": 0.1334435776683323, - "loss/policy_avg": 0.0028184684924781322, - "lr": 9.111068507157464e-06, - "objective/entropy": -277.342041015625, - "objective/kl": 34.34157180786133, - "objective/non_score_reward": -1.7170785665512085, - "objective/rlhf_reward": -5.352542483600315, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 6.7859721183776855, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.650390625, - "step": 1391, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9976534843444824 - }, - { - "episode": 22288, - "epoch": 0.13353944230746187, - "loss/policy_avg": 0.2673591077327728, - "lr": 9.110429447852761e-06, - "objective/entropy": -272.81146240234375, - "objective/kl": 32.08586883544922, - "objective/non_score_reward": -1.6042933464050293, - "objective/rlhf_reward": -4.993341405590144, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 66.21798706054688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.57421875, - "step": 1392, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9972776174545288 - }, - { - "episode": 22304, - "epoch": 0.13363530694659143, - "loss/policy_avg": 0.9891442060470581, - "lr": 9.109790388548058e-06, - "objective/entropy": -157.80642700195312, - "objective/kl": 41.81775665283203, - "objective/non_score_reward": -2.0908877849578857, - "objective/rlhf_reward": -7.02191524794641, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 140.4240264892578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.56640625, - "step": 1393, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9975658655166626 - }, - { - "episode": 22320, - "epoch": 0.13373117158572095, - "loss/policy_avg": 0.6556341052055359, - "lr": 9.109151329243355e-06, - "objective/entropy": -253.7165069580078, - "objective/kl": 27.159475326538086, - "objective/non_score_reward": -1.3579738140106201, - "objective/rlhf_reward": -3.3091887853303295, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 15.467697143554688, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.47265625, - "step": 1394, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.006256580352783 - }, - { - "episode": 22336, - "epoch": 0.1338270362248505, - "loss/policy_avg": -0.04786435142159462, - "lr": 9.10851226993865e-06, - "objective/entropy": -281.2685546875, - "objective/kl": 30.538103103637695, - "objective/non_score_reward": -1.5269051790237427, - "objective/rlhf_reward": -4.591848814281162, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 12.983705520629883, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.544921875, - "step": 1395, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0013580322265625 - }, - { - "episode": 22352, - "epoch": 0.13392290086398007, - "loss/policy_avg": -0.14957058429718018, - "lr": 9.107873210633947e-06, - "objective/entropy": -260.2832946777344, - "objective/kl": 37.010498046875, - "objective/non_score_reward": -1.85052490234375, - "objective/rlhf_reward": -6.0604638366991574, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.588924407958984, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6875, - "step": 1396, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982390403747559 - }, - { - "episode": 22368, - "epoch": 0.13401876550310962, - "loss/policy_avg": 0.03792187571525574, - "lr": 9.107234151329244e-06, - "objective/entropy": -202.82089233398438, - "objective/kl": 28.890417098999023, - "objective/non_score_reward": -1.4445207118988037, - "objective/rlhf_reward": -4.418833160136623, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.8451006412506104, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 1397, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0012331008911133 - }, - { - "episode": 22384, - "epoch": 0.13411463014223915, - "loss/policy_avg": 0.26423919200897217, - "lr": 9.10659509202454e-06, - "objective/entropy": -199.65274047851562, - "objective/kl": 19.250408172607422, - "objective/non_score_reward": -0.9625204205513, - "objective/rlhf_reward": -2.245961788956242, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 6.399721145629883, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.638671875, - "step": 1398, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9996623992919922 - }, - { - "episode": 22400, - "epoch": 0.1342104947813687, - "loss/policy_avg": 0.2948363423347473, - "lr": 9.105956032719838e-06, - "objective/entropy": -214.3868865966797, - "objective/kl": 25.899425506591797, - "objective/non_score_reward": -1.294971227645874, - "objective/rlhf_reward": -3.820635103915615, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 11.461451530456543, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.51171875, - "step": 1399, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9998140335083008 - }, - { - "episode": 22416, - "epoch": 0.13430635942049826, - "loss/policy_avg": -0.059535130858421326, - "lr": 9.105316973415133e-06, - "objective/entropy": -251.36669921875, - "objective/kl": 24.37100601196289, - "objective/non_score_reward": -1.218550205230713, - "objective/rlhf_reward": -3.474200969934463, - "objective/scores": 0.35, - "policy/approxkl_avg": 6.021501541137695, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5234375, - "step": 1400, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0187039375305176 - }, - { - "episode": 22432, - "epoch": 0.13440222405962782, - "loss/policy_avg": 0.09361746907234192, - "lr": 9.10467791411043e-06, - "objective/entropy": -250.89463806152344, - "objective/kl": 40.795570373535156, - "objective/non_score_reward": -2.039778470993042, - "objective/rlhf_reward": -6.759113764762878, - "objective/scores": 0.35, - "policy/approxkl_avg": 2.102271795272827, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.697265625, - "step": 1401, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9986069202423096 - }, - { - "episode": 22448, - "epoch": 0.13449808869875735, - "loss/policy_avg": 0.3411298990249634, - "lr": 9.104038854805727e-06, - "objective/entropy": -189.4188232421875, - "objective/kl": 21.962203979492188, - "objective/non_score_reward": -1.0981099605560303, - "objective/rlhf_reward": -2.9418420597032156, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 3.4877538681030273, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5234375, - "step": 1402, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997504711151123 - }, - { - "episode": 22464, - "epoch": 0.1345939533378869, - "loss/policy_avg": 0.1954708993434906, - "lr": 9.103399795501024e-06, - "objective/entropy": -215.69268798828125, - "objective/kl": 37.367271423339844, - "objective/non_score_reward": -1.868363618850708, - "objective/rlhf_reward": -6.022856573672637, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 3.3153672218322754, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.544921875, - "step": 1403, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9988248348236084 - }, - { - "episode": 22480, - "epoch": 0.13468981797701646, - "loss/policy_avg": -0.3388468027114868, - "lr": 9.10276073619632e-06, - "objective/entropy": -247.29574584960938, - "objective/kl": 33.42229080200195, - "objective/non_score_reward": -1.671114444732666, - "objective/rlhf_reward": -5.080338153902607, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 8.782747268676758, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5390625, - "step": 1404, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0028865337371826 - }, - { - "episode": 22496, - "epoch": 0.134785682616146, - "loss/policy_avg": 0.3877559304237366, - "lr": 9.102121676891617e-06, - "objective/entropy": -236.95657348632812, - "objective/kl": 35.144378662109375, - "objective/non_score_reward": -1.7572189569473267, - "objective/rlhf_reward": -4.906169237867866, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 23.945987701416016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62109375, - "step": 1405, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998518705368042 - }, - { - "episode": 22512, - "epoch": 0.13488154725527554, - "loss/policy_avg": 0.1250351220369339, - "lr": 9.101482617586912e-06, - "objective/entropy": -219.36123657226562, - "objective/kl": 27.327880859375, - "objective/non_score_reward": -1.36639404296875, - "objective/rlhf_reward": -4.041744013031093, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 17.805503845214844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.63671875, - "step": 1406, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9986976385116577 - }, - { - "episode": 22528, - "epoch": 0.1349774118944051, - "loss/policy_avg": 0.07265815138816833, - "lr": 9.10084355828221e-06, - "objective/entropy": -289.05718994140625, - "objective/kl": 26.725826263427734, - "objective/non_score_reward": -1.3362910747528076, - "objective/rlhf_reward": -3.829392933639225, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 4.363107204437256, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.603515625, - "step": 1407, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9994230270385742 - }, - { - "episode": 22544, - "epoch": 0.13507327653353465, - "loss/policy_avg": -0.3271891176700592, - "lr": 9.100204498977506e-06, - "objective/entropy": -243.45018005371094, - "objective/kl": 35.21052551269531, - "objective/non_score_reward": -1.7605262994766235, - "objective/rlhf_reward": -5.70046954443994, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 12.5887451171875, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.611328125, - "step": 1408, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9998884201049805 - }, - { - "episode": 22560, - "epoch": 0.1351691411726642, - "loss/policy_avg": 0.20253193378448486, - "lr": 9.099565439672803e-06, - "objective/entropy": -197.681640625, - "objective/kl": 29.323577880859375, - "objective/non_score_reward": -1.4661788940429688, - "objective/rlhf_reward": -3.917304108815129, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 0.8307449817657471, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 1409, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.00246524810791 - }, - { - "episode": 22576, - "epoch": 0.13526500581179374, - "loss/policy_avg": 0.3828544020652771, - "lr": 9.0989263803681e-06, - "objective/entropy": -283.22674560546875, - "objective/kl": 28.88727378845215, - "objective/non_score_reward": -1.4443637132644653, - "objective/rlhf_reward": -4.435819080382019, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 37.23944854736328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.61328125, - "step": 1410, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9961497783660889 - }, - { - "episode": 22592, - "epoch": 0.1353608704509233, - "loss/policy_avg": 0.0013767257332801819, - "lr": 9.098287321063395e-06, - "objective/entropy": -14.409706115722656, - "objective/kl": 35.32271194458008, - "objective/non_score_reward": -1.7661356925964355, - "objective/rlhf_reward": -5.705292427276058, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 19.86726188659668, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.67578125, - "step": 1411, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000868797302246 - }, - { - "episode": 22608, - "epoch": 0.13545673509005285, - "loss/policy_avg": 0.3584628403186798, - "lr": 9.097648261758692e-06, - "objective/entropy": -275.2825927734375, - "objective/kl": 30.668413162231445, - "objective/non_score_reward": -1.5334208011627197, - "objective/rlhf_reward": -4.186271618084843, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 3.2139475345611572, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 1412, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9992797374725342 - }, - { - "episode": 22624, - "epoch": 0.1355525997291824, - "loss/policy_avg": 0.07676204293966293, - "lr": 9.097009202453987e-06, - "objective/entropy": -161.75140380859375, - "objective/kl": 26.458412170410156, - "objective/non_score_reward": -1.322920560836792, - "objective/rlhf_reward": -3.8107296256378884, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.3583320379257202, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6484375, - "step": 1413, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001204013824463 - }, - { - "episode": 22640, - "epoch": 0.13564846436831193, - "loss/policy_avg": -0.348129540681839, - "lr": 9.096370143149284e-06, - "objective/entropy": -161.31414794921875, - "objective/kl": 40.132015228271484, - "objective/non_score_reward": -2.006600856781006, - "objective/rlhf_reward": -6.470143883433893, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 9.361883163452148, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.69921875, - "step": 1414, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000566005706787 - }, - { - "episode": 22656, - "epoch": 0.1357443290074415, - "loss/policy_avg": 0.14402732253074646, - "lr": 9.095731083844581e-06, - "objective/entropy": -256.77880859375, - "objective/kl": 29.49087142944336, - "objective/non_score_reward": -1.474543571472168, - "objective/rlhf_reward": -4.3824026224934425, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.047593355178833, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.634765625, - "step": 1415, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0007848739624023 - }, - { - "episode": 22672, - "epoch": 0.13584019364657104, - "loss/policy_avg": -0.055459946393966675, - "lr": 9.095092024539878e-06, - "objective/entropy": -226.90335083007812, - "objective/kl": 35.10498809814453, - "objective/non_score_reward": -1.7552495002746582, - "objective/rlhf_reward": -5.570399980159149, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 30.274810791015625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.69921875, - "step": 1416, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.999528408050537 - }, - { - "episode": 22688, - "epoch": 0.1359360582857006, - "loss/policy_avg": 0.27508485317230225, - "lr": 9.094452965235175e-06, - "objective/entropy": -143.74124145507812, - "objective/kl": 37.461273193359375, - "objective/non_score_reward": -1.8730638027191162, - "objective/rlhf_reward": -5.092255330085754, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.542268753051758, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.36328125, - "step": 1417, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973454475402832 - }, - { - "episode": 22704, - "epoch": 0.13603192292483013, - "loss/policy_avg": 0.22670505940914154, - "lr": 9.093813905930472e-06, - "objective/entropy": -235.04212951660156, - "objective/kl": 27.81060791015625, - "objective/non_score_reward": -1.3905303478240967, - "objective/rlhf_reward": -4.046349847110447, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 18.150178909301758, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.828125, - "step": 1418, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9964942932128906 - }, - { - "episode": 22720, - "epoch": 0.13612778756395968, - "loss/policy_avg": -0.00467962771654129, - "lr": 9.093174846625767e-06, - "objective/entropy": -241.10305786132812, - "objective/kl": 36.07202911376953, - "objective/non_score_reward": -1.803601622581482, - "objective/rlhf_reward": -5.790574391086665, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 10.899436950683594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.609375, - "step": 1419, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9986159801483154 - }, - { - "episode": 22736, - "epoch": 0.13622365220308924, - "loss/policy_avg": 0.11034490168094635, - "lr": 9.092535787321064e-06, - "objective/entropy": -318.0650939941406, - "objective/kl": 31.329748153686523, - "objective/non_score_reward": -1.5664875507354736, - "objective/rlhf_reward": -4.5326165119806925, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.8243210315704346, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.66015625, - "step": 1420, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999915599822998 - }, - { - "episode": 22752, - "epoch": 0.1363195168422188, - "loss/policy_avg": 0.04376043379306793, - "lr": 9.09189672801636e-06, - "objective/entropy": -257.55560302734375, - "objective/kl": 39.71128845214844, - "objective/non_score_reward": -1.9855643510818481, - "objective/rlhf_reward": -5.542257642745971, - "objective/scores": 0.6, - "policy/approxkl_avg": 33.98119354248047, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.681640625, - "step": 1421, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9983115196228027 - }, - { - "episode": 22768, - "epoch": 0.13641538148134832, - "loss/policy_avg": 1.4271972179412842, - "lr": 9.091257668711657e-06, - "objective/entropy": -200.30636596679688, - "objective/kl": 37.72491455078125, - "objective/non_score_reward": -1.8862457275390625, - "objective/rlhf_reward": -5.422276677862678, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 4.053761005401611, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6171875, - "step": 1422, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0006790161132812 - }, - { - "episode": 22784, - "epoch": 0.13651124612047788, - "loss/policy_avg": 0.14689955115318298, - "lr": 9.090618609406954e-06, - "objective/entropy": -185.34646606445312, - "objective/kl": 29.554128646850586, - "objective/non_score_reward": -1.4777064323425293, - "objective/rlhf_reward": -3.7881193778672557, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 2.1496503353118896, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.537109375, - "step": 1423, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997967004776001 - }, - { - "episode": 22800, - "epoch": 0.13660711075960744, - "loss/policy_avg": 1.6550846099853516, - "lr": 9.08997955010225e-06, - "objective/entropy": -237.4461669921875, - "objective/kl": 24.241506576538086, - "objective/non_score_reward": -1.2120752334594727, - "objective/rlhf_reward": -3.5227884388267228, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 2.2821950912475586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.623046875, - "step": 1424, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0027098655700684 - }, - { - "episode": 22816, - "epoch": 0.136702975398737, - "loss/policy_avg": 0.04801030457019806, - "lr": 9.089340490797546e-06, - "objective/entropy": -192.99920654296875, - "objective/kl": 42.85979461669922, - "objective/non_score_reward": -2.1429896354675293, - "objective/rlhf_reward": -7.0910067586258645, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 7.908871650695801, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.640625, - "step": 1425, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9976098537445068 - }, - { - "episode": 22832, - "epoch": 0.13679884003786652, - "loss/policy_avg": 0.773149847984314, - "lr": 9.088701431492843e-06, - "objective/entropy": -204.166015625, - "objective/kl": 33.09935760498047, - "objective/non_score_reward": -1.6549677848815918, - "objective/rlhf_reward": -4.219871020317077, - "objective/scores": 0.6, - "policy/approxkl_avg": 20.173633575439453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.69921875, - "step": 1426, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9965462684631348 - }, - { - "episode": 22848, - "epoch": 0.13689470467699608, - "loss/policy_avg": 0.1709783971309662, - "lr": 9.08806237218814e-06, - "objective/entropy": -294.40911865234375, - "objective/kl": 33.224021911621094, - "objective/non_score_reward": -1.6612012386322021, - "objective/rlhf_reward": -5.244805312156677, - "objective/scores": 0.35, - "policy/approxkl_avg": 8.916924476623535, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.62109375, - "step": 1427, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9989229440689087 - }, - { - "episode": 22864, - "epoch": 0.13699056931612563, - "loss/policy_avg": -0.012170173227787018, - "lr": 9.087423312883437e-06, - "objective/entropy": -246.09201049804688, - "objective/kl": 36.022274017333984, - "objective/non_score_reward": -1.8011138439178467, - "objective/rlhf_reward": -5.862819781809478, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 1.0541167259216309, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.646484375, - "step": 1428, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0022614002227783 - }, - { - "episode": 22880, - "epoch": 0.1370864339552552, - "loss/policy_avg": -0.04561644792556763, - "lr": 9.086784253578734e-06, - "objective/entropy": -243.6079559326172, - "objective/kl": 26.140531539916992, - "objective/non_score_reward": -1.3070266246795654, - "objective/rlhf_reward": -3.7123345372998084, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 66.2858657836914, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6484375, - "step": 1429, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9988305568695068 - }, - { - "episode": 22896, - "epoch": 0.13718229859438472, - "loss/policy_avg": -0.43349897861480713, - "lr": 9.086145194274029e-06, - "objective/entropy": -226.036376953125, - "objective/kl": 34.60681915283203, - "objective/non_score_reward": -1.730340838432312, - "objective/rlhf_reward": -5.440410855229258, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 9.1111478805542, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.591796875, - "step": 1430, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0083601474761963 - }, - { - "episode": 22912, - "epoch": 0.13727816323351427, - "loss/policy_avg": -0.0003149360418319702, - "lr": 9.085506134969326e-06, - "objective/entropy": -238.45263671875, - "objective/kl": 36.35710144042969, - "objective/non_score_reward": -1.817854881286621, - "objective/rlhf_reward": -5.755647504123386, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 9.317008018493652, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.525390625, - "step": 1431, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985694885253906 - }, - { - "episode": 22928, - "epoch": 0.13737402787264383, - "loss/policy_avg": 0.06102012097835541, - "lr": 9.084867075664623e-06, - "objective/entropy": -249.6441650390625, - "objective/kl": 30.982303619384766, - "objective/non_score_reward": -1.5491151809692383, - "objective/rlhf_reward": -6.1964609026908875, - "objective/scores": 0.0, - "policy/approxkl_avg": 1.687659502029419, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.580078125, - "step": 1432, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0023670196533203 - }, - { - "episode": 22944, - "epoch": 0.13746989251177338, - "loss/policy_avg": 0.4118673503398895, - "lr": 9.08422801635992e-06, - "objective/entropy": -208.8826446533203, - "objective/kl": 30.2443790435791, - "objective/non_score_reward": -1.512218952178955, - "objective/rlhf_reward": -3.926169695631538, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.5902602672576904, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.60546875, - "step": 1433, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0101170539855957 - }, - { - "episode": 22960, - "epoch": 0.1375657571509029, - "loss/policy_avg": 0.48478835821151733, - "lr": 9.083588957055215e-06, - "objective/entropy": -193.95437622070312, - "objective/kl": 32.08013153076172, - "objective/non_score_reward": -1.6040066480636597, - "objective/rlhf_reward": -2.0160264730453488, - "objective/scores": 1.1, - "policy/approxkl_avg": 3.34360408782959, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.658203125, - "step": 1434, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999372959136963 - }, - { - "episode": 22976, - "epoch": 0.13766162179003247, - "loss/policy_avg": -0.11271242052316666, - "lr": 9.082949897750512e-06, - "objective/entropy": -230.10296630859375, - "objective/kl": 24.382526397705078, - "objective/non_score_reward": -1.2191263437271118, - "objective/rlhf_reward": -4.876505374908447, - "objective/scores": 0.0, - "policy/approxkl_avg": 127.57635498046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.701171875, - "step": 1435, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.004199504852295 - }, - { - "episode": 22992, - "epoch": 0.13775748642916202, - "loss/policy_avg": 0.06364642083644867, - "lr": 9.082310838445809e-06, - "objective/entropy": -229.41696166992188, - "objective/kl": 41.24948501586914, - "objective/non_score_reward": -2.062474489212036, - "objective/rlhf_reward": -6.908262303381591, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 5.2524919509887695, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.732421875, - "step": 1436, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999943733215332 - }, - { - "episode": 23008, - "epoch": 0.13785335106829158, - "loss/policy_avg": 0.48797979950904846, - "lr": 9.081671779141104e-06, - "objective/entropy": -276.9367370605469, - "objective/kl": 39.9646110534668, - "objective/non_score_reward": -1.9982305765151978, - "objective/rlhf_reward": -6.331062798917877, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 0.5203732252120972, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.705078125, - "step": 1437, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0025830268859863 - }, - { - "episode": 23024, - "epoch": 0.1379492157074211, - "loss/policy_avg": -0.24298250675201416, - "lr": 9.0810327198364e-06, - "objective/entropy": -105.76115417480469, - "objective/kl": 33.14936447143555, - "objective/non_score_reward": -1.657468318939209, - "objective/rlhf_reward": -5.0257530546823315, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 8.427331924438477, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.57421875, - "step": 1438, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.009620428085327 - }, - { - "episode": 23040, - "epoch": 0.13804508034655066, - "loss/policy_avg": 0.11671873927116394, - "lr": 9.080393660531698e-06, - "objective/entropy": -207.38742065429688, - "objective/kl": 25.86302375793457, - "objective/non_score_reward": -1.2931511402130127, - "objective/rlhf_reward": -3.7487728192406573, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 2.233325481414795, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.56640625, - "step": 1439, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.003443717956543 - }, - { - "episode": 23056, - "epoch": 0.13814094498568022, - "loss/policy_avg": 0.32801347970962524, - "lr": 9.079754601226994e-06, - "objective/entropy": -256.9577941894531, - "objective/kl": 46.250244140625, - "objective/non_score_reward": -2.312512159347534, - "objective/rlhf_reward": -7.30263740845197, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 13.493009567260742, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.69921875, - "step": 1440, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9997389316558838 - }, - { - "episode": 23072, - "epoch": 0.13823680962480978, - "loss/policy_avg": 0.11888322979211807, - "lr": 9.079115541922291e-06, - "objective/entropy": -188.8265380859375, - "objective/kl": 36.61035919189453, - "objective/non_score_reward": -1.8305180072784424, - "objective/rlhf_reward": -5.717951808039265, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 4.184604644775391, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5703125, - "step": 1441, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9999384880065918 - }, - { - "episode": 23088, - "epoch": 0.13833267426393933, - "loss/policy_avg": 1.4782209396362305, - "lr": 9.078476482617588e-06, - "objective/entropy": -250.94830322265625, - "objective/kl": 32.46335983276367, - "objective/non_score_reward": -1.6231679916381836, - "objective/rlhf_reward": -5.04207394561325, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 12.02247428894043, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71484375, - "step": 1442, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9968842267990112 - }, - { - "episode": 23104, - "epoch": 0.13842853890306886, - "loss/policy_avg": 0.3007156252861023, - "lr": 9.077837423312883e-06, - "objective/entropy": -287.5181884765625, - "objective/kl": 36.19750213623047, - "objective/non_score_reward": -1.8098750114440918, - "objective/rlhf_reward": -2.839500284194946, - "objective/scores": 1.1, - "policy/approxkl_avg": 74.40210723876953, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.638671875, - "step": 1443, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000827312469482 - }, - { - "episode": 23120, - "epoch": 0.13852440354219842, - "loss/policy_avg": 0.04818664491176605, - "lr": 9.07719836400818e-06, - "objective/entropy": -213.17276000976562, - "objective/kl": 29.73092269897461, - "objective/non_score_reward": -1.4865461587905884, - "objective/rlhf_reward": -4.284325008810149, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 7.239911079406738, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.703125, - "step": 1444, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0012645721435547 - }, - { - "episode": 23136, - "epoch": 0.13862026818132797, - "loss/policy_avg": -0.07083216309547424, - "lr": 9.076559304703477e-06, - "objective/entropy": -245.51406860351562, - "objective/kl": 30.533039093017578, - "objective/non_score_reward": -1.5266518592834473, - "objective/rlhf_reward": -4.706607258319854, - "objective/scores": 0.35, - "policy/approxkl_avg": 18.655948638916016, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.76953125, - "step": 1445, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9989795684814453 - }, - { - "episode": 23152, - "epoch": 0.13871613282045753, - "loss/policy_avg": -0.036476410925388336, - "lr": 9.075920245398774e-06, - "objective/entropy": -170.21502685546875, - "objective/kl": 32.540916442871094, - "objective/non_score_reward": -1.62704598903656, - "objective/rlhf_reward": -5.182671103507204, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 32.26573181152344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65234375, - "step": 1446, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.000288963317871 - }, - { - "episode": 23168, - "epoch": 0.13881199745958706, - "loss/policy_avg": -0.017804868519306183, - "lr": 9.075281186094071e-06, - "objective/entropy": -137.2032470703125, - "objective/kl": 43.56850814819336, - "objective/non_score_reward": -2.1784255504608154, - "objective/rlhf_reward": -7.263103942485198, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 6.8924241065979, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.701171875, - "step": 1447, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0064001083374023 - }, - { - "episode": 23184, - "epoch": 0.1389078620987166, - "loss/policy_avg": -0.14877469837665558, - "lr": 9.074642126789366e-06, - "objective/entropy": -225.8348388671875, - "objective/kl": 37.99517059326172, - "objective/non_score_reward": -1.8997586965560913, - "objective/rlhf_reward": -5.476328434721504, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 12.023405075073242, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.52734375, - "step": 1448, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.999727725982666 - }, - { - "episode": 23200, - "epoch": 0.13900372673784617, - "loss/policy_avg": 0.5529218316078186, - "lr": 9.074003067484663e-06, - "objective/entropy": -227.67355346679688, - "objective/kl": 37.310272216796875, - "objective/non_score_reward": -1.865513801574707, - "objective/rlhf_reward": -7.462055325508118, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.481158256530762, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.63671875, - "step": 1449, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991602897644043 - }, - { - "episode": 23216, - "epoch": 0.13909959137697572, - "loss/policy_avg": 0.12435504049062729, - "lr": 9.07336400817996e-06, - "objective/entropy": -278.33416748046875, - "objective/kl": 31.698009490966797, - "objective/non_score_reward": -1.5849003791809082, - "objective/rlhf_reward": -4.216895284430061, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 7.932787895202637, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.560546875, - "step": 1450, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9972891807556152 - }, - { - "episode": 23232, - "epoch": 0.13919545601610525, - "loss/policy_avg": 0.281582236289978, - "lr": 9.072724948875257e-06, - "objective/entropy": -217.79067993164062, - "objective/kl": 24.10039520263672, - "objective/non_score_reward": -1.2050197124481201, - "objective/rlhf_reward": -3.2159590459504894, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 10.788034439086914, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 1451, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0001490116119385 - }, - { - "episode": 23248, - "epoch": 0.1392913206552348, - "loss/policy_avg": 0.29366064071655273, - "lr": 9.072085889570554e-06, - "objective/entropy": -186.27256774902344, - "objective/kl": 31.6138858795166, - "objective/non_score_reward": -1.580694317817688, - "objective/rlhf_reward": -4.981141617804198, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 34.03179931640625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6328125, - "step": 1452, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998284101486206 - }, - { - "episode": 23264, - "epoch": 0.13938718529436436, - "loss/policy_avg": -0.07383158057928085, - "lr": 9.07144683026585e-06, - "objective/entropy": -212.43728637695312, - "objective/kl": 32.51192092895508, - "objective/non_score_reward": -1.625596046447754, - "objective/rlhf_reward": -4.84052479785739, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 12.179756164550781, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.56640625, - "step": 1453, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0035171508789062 - }, - { - "episode": 23280, - "epoch": 0.13948304993349392, - "loss/policy_avg": 0.3167204260826111, - "lr": 9.070807770961146e-06, - "objective/entropy": -238.9555206298828, - "objective/kl": 32.64189529418945, - "objective/non_score_reward": -1.6320947408676147, - "objective/rlhf_reward": -4.128379082679748, - "objective/scores": 0.6, - "policy/approxkl_avg": 23.131351470947266, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.666015625, - "step": 1454, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998004674911499 - }, - { - "episode": 23296, - "epoch": 0.13957891457262345, - "loss/policy_avg": 0.23313897848129272, - "lr": 9.070168711656443e-06, - "objective/entropy": -61.311492919921875, - "objective/kl": 33.116355895996094, - "objective/non_score_reward": -1.6558178663253784, - "objective/rlhf_reward": -5.297758612662477, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 35.88987350463867, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.880859375, - "step": 1455, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9964382648468018 - }, - { - "episode": 23312, - "epoch": 0.139674779211753, - "loss/policy_avg": 0.24700571596622467, - "lr": 9.069529652351738e-06, - "objective/entropy": -255.76536560058594, - "objective/kl": 32.391082763671875, - "objective/non_score_reward": -1.6195542812347412, - "objective/rlhf_reward": -5.136581590681701, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 18.76715087890625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.658203125, - "step": 1456, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9971431493759155 - }, - { - "episode": 23328, - "epoch": 0.13977064385088256, - "loss/policy_avg": 0.15339264273643494, - "lr": 9.068890593047035e-06, - "objective/entropy": -262.24359130859375, - "objective/kl": 44.425445556640625, - "objective/non_score_reward": -2.2212722301483154, - "objective/rlhf_reward": -7.328829615321711, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 10.498491287231445, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.578125, - "step": 1457, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.99570894241333 - }, - { - "episode": 23344, - "epoch": 0.13986650849001211, - "loss/policy_avg": 0.2555537521839142, - "lr": 9.068251533742332e-06, - "objective/entropy": -233.40225219726562, - "objective/kl": 32.24791717529297, - "objective/non_score_reward": -1.6123958826065063, - "objective/rlhf_reward": -4.998985271067962, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 26.18222999572754, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.591796875, - "step": 1458, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9968748092651367 - }, - { - "episode": 23360, - "epoch": 0.13996237312914164, - "loss/policy_avg": -0.060967281460762024, - "lr": 9.067612474437628e-06, - "objective/entropy": -264.13665771484375, - "objective/kl": 40.685516357421875, - "objective/non_score_reward": -2.034276008605957, - "objective/rlhf_reward": -6.014397802130256, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 3.826961040496826, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7109375, - "step": 1459, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0120983123779297 - }, - { - "episode": 23376, - "epoch": 0.1400582377682712, - "loss/policy_avg": 0.20050451159477234, - "lr": 9.066973415132925e-06, - "objective/entropy": -100.12745666503906, - "objective/kl": 37.77911376953125, - "objective/non_score_reward": -1.88895583152771, - "objective/rlhf_reward": -6.177221098033291, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 11.450329780578613, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.84375, - "step": 1460, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0007660388946533 - }, - { - "episode": 23392, - "epoch": 0.14015410240740075, - "loss/policy_avg": 0.27806586027145386, - "lr": 9.06633435582822e-06, - "objective/entropy": -227.0691375732422, - "objective/kl": 31.303085327148438, - "objective/non_score_reward": -1.5651543140411377, - "objective/rlhf_reward": -4.935104284316225, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.6722452640533447, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.544921875, - "step": 1461, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.002023696899414 - }, - { - "episode": 23408, - "epoch": 0.1402499670465303, - "loss/policy_avg": 0.4921458065509796, - "lr": 9.065695296523517e-06, - "objective/entropy": -282.57427978515625, - "objective/kl": 26.013471603393555, - "objective/non_score_reward": -1.3006736040115356, - "objective/rlhf_reward": -2.2789752229463787, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 4.31304931640625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.73046875, - "step": 1462, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9963445663452148 - }, - { - "episode": 23424, - "epoch": 0.14034583168565984, - "loss/policy_avg": 0.3357080817222595, - "lr": 9.065056237218814e-06, - "objective/entropy": -292.97235107421875, - "objective/kl": 25.093395233154297, - "objective/non_score_reward": -1.2546697854995728, - "objective/rlhf_reward": -5.018679141998291, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.99601936340332, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73046875, - "step": 1463, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9978545904159546 - }, - { - "episode": 23440, - "epoch": 0.1404416963247894, - "loss/policy_avg": 0.2813834846019745, - "lr": 9.064417177914111e-06, - "objective/entropy": -208.256103515625, - "objective/kl": 36.03406524658203, - "objective/non_score_reward": -1.8017032146453857, - "objective/rlhf_reward": -5.47347940603892, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 30.506061553955078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.73828125, - "step": 1464, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.998169183731079 - }, - { - "episode": 23456, - "epoch": 0.14053756096391895, - "loss/policy_avg": 0.2761915922164917, - "lr": 9.063778118609408e-06, - "objective/entropy": -172.63931274414062, - "objective/kl": 33.94431686401367, - "objective/non_score_reward": -1.6972159147262573, - "objective/rlhf_reward": -5.365031678875057, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 4.0666117668151855, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.662109375, - "step": 1465, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994304180145264 - }, - { - "episode": 23472, - "epoch": 0.1406334256030485, - "loss/policy_avg": 0.667277455329895, - "lr": 9.063139059304705e-06, - "objective/entropy": -179.61021423339844, - "objective/kl": 30.94757843017578, - "objective/non_score_reward": -1.5473790168762207, - "objective/rlhf_reward": -4.8109142566598475, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.069715976715088, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.662109375, - "step": 1466, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0003573894500732 - }, - { - "episode": 23488, - "epoch": 0.14072929024217803, - "loss/policy_avg": 0.03628428280353546, - "lr": 9.0625e-06, - "objective/entropy": -235.0047607421875, - "objective/kl": 34.63279342651367, - "objective/non_score_reward": -1.7316396236419678, - "objective/rlhf_reward": -4.52655873298645, - "objective/scores": 0.6, - "policy/approxkl_avg": 58.85393524169922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.734375, - "step": 1467, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000277280807495 - }, - { - "episode": 23504, - "epoch": 0.1408251548813076, - "loss/policy_avg": 0.2930186688899994, - "lr": 9.061860940695297e-06, - "objective/entropy": -311.19976806640625, - "objective/kl": 39.07278060913086, - "objective/non_score_reward": -1.953639030456543, - "objective/rlhf_reward": -6.258296697345331, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 5.577837944030762, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.654296875, - "step": 1468, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985158443450928 - }, - { - "episode": 23520, - "epoch": 0.14092101952043715, - "loss/policy_avg": -0.16989761590957642, - "lr": 9.061221881390594e-06, - "objective/entropy": -221.3096923828125, - "objective/kl": 34.3406982421875, - "objective/non_score_reward": -1.7170348167419434, - "objective/rlhf_reward": -4.468139624595642, - "objective/scores": 0.6, - "policy/approxkl_avg": 24.528539657592773, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.650390625, - "step": 1469, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.003714084625244 - }, - { - "episode": 23536, - "epoch": 0.1410168841595667, - "loss/policy_avg": 0.5539761781692505, - "lr": 9.06058282208589e-06, - "objective/entropy": -208.52297973632812, - "objective/kl": 31.98680877685547, - "objective/non_score_reward": -1.5993404388427734, - "objective/rlhf_reward": -5.038111769889278, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.0958806276321411, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.583984375, - "step": 1470, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0022308826446533 - }, - { - "episode": 23552, - "epoch": 0.14111274879869623, - "loss/policy_avg": 0.18784965574741364, - "lr": 9.059943762781188e-06, - "objective/entropy": -209.775146484375, - "objective/kl": 32.73426818847656, - "objective/non_score_reward": -1.6367132663726807, - "objective/rlhf_reward": -5.205217769652037, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 14.739479064941406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.830078125, - "step": 1471, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9984816312789917 - }, - { - "episode": 23568, - "epoch": 0.1412086134378258, - "loss/policy_avg": -0.015954041853547096, - "lr": 9.059304703476484e-06, - "objective/entropy": -266.16412353515625, - "objective/kl": 41.6230354309082, - "objective/non_score_reward": -2.0811514854431152, - "objective/rlhf_reward": -6.965356790755672, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.9907431602478027, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.751953125, - "step": 1472, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9974491596221924 - }, - { - "episode": 23584, - "epoch": 0.14130447807695534, - "loss/policy_avg": 0.38890203833580017, - "lr": 9.05866564417178e-06, - "objective/entropy": -276.6087951660156, - "objective/kl": 31.15995216369629, - "objective/non_score_reward": -1.5579975843429565, - "objective/rlhf_reward": -4.407161588939737, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 27.55563735961914, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.587890625, - "step": 1473, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9981493949890137 - }, - { - "episode": 23600, - "epoch": 0.1414003427160849, - "loss/policy_avg": 0.47803691029548645, - "lr": 9.058026584867077e-06, - "objective/entropy": -212.98114013671875, - "objective/kl": 45.53947067260742, - "objective/non_score_reward": -2.2769737243652344, - "objective/rlhf_reward": -7.782381329566164, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 18.62641716003418, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.689453125, - "step": 1474, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9991447925567627 - }, - { - "episode": 23616, - "epoch": 0.14149620735521443, - "loss/policy_avg": -0.034300077706575394, - "lr": 9.057387525562373e-06, - "objective/entropy": -228.06358337402344, - "objective/kl": 31.04609489440918, - "objective/non_score_reward": -1.5523046255111694, - "objective/rlhf_reward": -4.883705649405641, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 2.5281810760498047, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.634765625, - "step": 1475, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0011422634124756 - }, - { - "episode": 23632, - "epoch": 0.14159207199434398, - "loss/policy_avg": 0.021888693794608116, - "lr": 9.05674846625767e-06, - "objective/entropy": -190.63259887695312, - "objective/kl": 25.703638076782227, - "objective/non_score_reward": -1.2851818799972534, - "objective/rlhf_reward": -3.4788681320553883, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.3920494318008423, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.521484375, - "step": 1476, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001204013824463 - }, - { - "episode": 23648, - "epoch": 0.14168793663347354, - "loss/policy_avg": 0.5509345531463623, - "lr": 9.056109406952967e-06, - "objective/entropy": -259.56756591796875, - "objective/kl": 34.98920440673828, - "objective/non_score_reward": -1.749460220336914, - "objective/rlhf_reward": -5.335981493414031, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 10.9700927734375, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.57421875, - "step": 1477, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0011067390441895 - }, - { - "episode": 23664, - "epoch": 0.1417838012726031, - "loss/policy_avg": -0.3546954095363617, - "lr": 9.055470347648262e-06, - "objective/entropy": -182.73776245117188, - "objective/kl": 40.87359619140625, - "objective/non_score_reward": -2.043679714202881, - "objective/rlhf_reward": -5.251000319362852, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 18.994150161743164, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.83203125, - "step": 1478, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9988384246826172 - }, - { - "episode": 23680, - "epoch": 0.14187966591173262, - "loss/policy_avg": 0.18590596318244934, - "lr": 9.05483128834356e-06, - "objective/entropy": -204.18365478515625, - "objective/kl": 27.81656837463379, - "objective/non_score_reward": -1.3908284902572632, - "objective/rlhf_reward": -4.204064035151882, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.7427480220794678, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.595703125, - "step": 1479, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998271465301514 - }, - { - "episode": 23696, - "epoch": 0.14197553055086218, - "loss/policy_avg": 2.363548755645752, - "lr": 9.054192229038854e-06, - "objective/entropy": -259.5701599121094, - "objective/kl": 34.00105285644531, - "objective/non_score_reward": -1.7000526189804077, - "objective/rlhf_reward": -4.8527993661927535, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 24.17676544189453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.705078125, - "step": 1480, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9970941543579102 - }, - { - "episode": 23712, - "epoch": 0.14207139518999173, - "loss/policy_avg": 0.10088340193033218, - "lr": 9.053553169734151e-06, - "objective/entropy": -170.8048095703125, - "objective/kl": 33.465816497802734, - "objective/non_score_reward": -1.6732908487319946, - "objective/rlhf_reward": -5.367650661498232, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 3.380504608154297, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.56640625, - "step": 1481, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998365640640259 - }, - { - "episode": 23728, - "epoch": 0.1421672598291213, - "loss/policy_avg": 0.303548663854599, - "lr": 9.052914110429448e-06, - "objective/entropy": -291.6600341796875, - "objective/kl": 36.511512756347656, - "objective/non_score_reward": -1.8255757093429565, - "objective/rlhf_reward": -5.878470738132563, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 22.745920181274414, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5703125, - "step": 1482, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9991157054901123 - }, - { - "episode": 23744, - "epoch": 0.14226312446825082, - "loss/policy_avg": 0.6809051632881165, - "lr": 9.052275051124745e-06, - "objective/entropy": -224.0225830078125, - "objective/kl": 27.76144790649414, - "objective/non_score_reward": -1.3880724906921387, - "objective/rlhf_reward": -4.036518180164036, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 1.4273369312286377, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.64453125, - "step": 1483, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000868082046509 - }, - { - "episode": 23760, - "epoch": 0.14235898910738037, - "loss/policy_avg": 0.5665885210037231, - "lr": 9.051635991820042e-06, - "objective/entropy": -210.581298828125, - "objective/kl": 42.927467346191406, - "objective/non_score_reward": -2.1463735103607178, - "objective/rlhf_reward": -7.134895424456939, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 106.22406005859375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.75, - "step": 1484, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9954371452331543 - }, - { - "episode": 23776, - "epoch": 0.14245485374650993, - "loss/policy_avg": 0.09896639734506607, - "lr": 9.050996932515339e-06, - "objective/entropy": -248.680419921875, - "objective/kl": 31.273435592651367, - "objective/non_score_reward": -1.5636719465255737, - "objective/rlhf_reward": -4.830855686863033, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 124.75199890136719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73046875, - "step": 1485, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998034954071045 - }, - { - "episode": 23792, - "epoch": 0.14255071838563949, - "loss/policy_avg": 0.06345228850841522, - "lr": 9.050357873210634e-06, - "objective/entropy": -207.33094787597656, - "objective/kl": 41.283634185791016, - "objective/non_score_reward": -2.0641818046569824, - "objective/rlhf_reward": -6.775774124081492, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.8670159578323364, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.63671875, - "step": 1486, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999525547027588 - }, - { - "episode": 23808, - "epoch": 0.142646583024769, - "loss/policy_avg": -0.08927027136087418, - "lr": 9.049718813905931e-06, - "objective/entropy": -192.79010009765625, - "objective/kl": 38.38804626464844, - "objective/non_score_reward": -1.9194023609161377, - "objective/rlhf_reward": -6.073489222590046, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.219352960586548, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.607421875, - "step": 1487, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.005600929260254 - }, - { - "episode": 23824, - "epoch": 0.14274244766389857, - "loss/policy_avg": 0.17106056213378906, - "lr": 9.049079754601228e-06, - "objective/entropy": -174.7689208984375, - "objective/kl": 33.021854400634766, - "objective/non_score_reward": -1.6510926485061646, - "objective/rlhf_reward": -4.871037379900614, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 1.4489188194274902, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6171875, - "step": 1488, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9988312721252441 - }, - { - "episode": 23840, - "epoch": 0.14283831230302813, - "loss/policy_avg": -0.03305444121360779, - "lr": 9.048440695296525e-06, - "objective/entropy": -201.0426788330078, - "objective/kl": 38.58399200439453, - "objective/non_score_reward": -1.9291996955871582, - "objective/rlhf_reward": -5.316798543930053, - "objective/scores": 0.6, - "policy/approxkl_avg": 23.38889503479004, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 1489, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998862385749817 - }, - { - "episode": 23856, - "epoch": 0.14293417694215768, - "loss/policy_avg": -0.13663126528263092, - "lr": 9.047801635991821e-06, - "objective/entropy": -128.63768005371094, - "objective/kl": 31.9277400970459, - "objective/non_score_reward": -1.5963871479034424, - "objective/rlhf_reward": -4.869776809009251, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 67.26014709472656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 1490, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001464366912842 - }, - { - "episode": 23872, - "epoch": 0.1430300415812872, - "loss/policy_avg": -0.10490886867046356, - "lr": 9.047162576687117e-06, - "objective/entropy": -172.6685791015625, - "objective/kl": 40.15425491333008, - "objective/non_score_reward": -2.0077128410339355, - "objective/rlhf_reward": -6.206022734912942, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 1.7667760848999023, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.630859375, - "step": 1491, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000633716583252 - }, - { - "episode": 23888, - "epoch": 0.14312590622041677, - "loss/policy_avg": -0.010392919182777405, - "lr": 9.046523517382414e-06, - "objective/entropy": -114.63517761230469, - "objective/kl": 42.928489685058594, - "objective/non_score_reward": -2.1464245319366455, - "objective/rlhf_reward": -5.6619793518793315, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 23.010705947875977, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.806640625, - "step": 1492, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.000854015350342 - }, - { - "episode": 23904, - "epoch": 0.14322177085954632, - "loss/policy_avg": 0.33309632539749146, - "lr": 9.04588445807771e-06, - "objective/entropy": -254.52764892578125, - "objective/kl": 36.419219970703125, - "objective/non_score_reward": -1.8209609985351562, - "objective/rlhf_reward": -7.2838438749313354, - "objective/scores": 0.0, - "policy/approxkl_avg": 15.836235046386719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7734375, - "step": 1493, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 2.0010738372802734 - }, - { - "episode": 23920, - "epoch": 0.14331763549867588, - "loss/policy_avg": 0.7723073959350586, - "lr": 9.045245398773007e-06, - "objective/entropy": -241.48744201660156, - "objective/kl": 45.3399658203125, - "objective/non_score_reward": -2.266998291015625, - "objective/rlhf_reward": -9.06799328327179, - "objective/scores": 0.0, - "policy/approxkl_avg": 72.61679077148438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.62109375, - "step": 1494, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9994328022003174 - }, - { - "episode": 23936, - "epoch": 0.1434135001378054, - "loss/policy_avg": 0.12270835041999817, - "lr": 9.044606339468304e-06, - "objective/entropy": -233.93212890625, - "objective/kl": 36.94314956665039, - "objective/non_score_reward": -1.8471574783325195, - "objective/rlhf_reward": -5.726770406187163, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 18.066661834716797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7421875, - "step": 1495, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.997254729270935 - }, - { - "episode": 23952, - "epoch": 0.14350936477693496, - "loss/policy_avg": 0.17836476862430573, - "lr": 9.043967280163601e-06, - "objective/entropy": -189.3540496826172, - "objective/kl": 33.162147521972656, - "objective/non_score_reward": -1.6581075191497803, - "objective/rlhf_reward": -2.232429778575897, - "objective/scores": 1.1, - "policy/approxkl_avg": 8.499994277954102, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.693359375, - "step": 1496, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998523473739624 - }, - { - "episode": 23968, - "epoch": 0.14360522941606452, - "loss/policy_avg": 0.02606182172894478, - "lr": 9.043328220858896e-06, - "objective/entropy": -187.52023315429688, - "objective/kl": 27.699565887451172, - "objective/non_score_reward": -1.3849782943725586, - "objective/rlhf_reward": -3.983654229846552, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 35.55757522583008, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.681640625, - "step": 1497, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000664234161377 - }, - { - "episode": 23984, - "epoch": 0.14370109405519407, - "loss/policy_avg": 0.2732602655887604, - "lr": 9.042689161554193e-06, - "objective/entropy": -221.50460815429688, - "objective/kl": 31.577119827270508, - "objective/non_score_reward": -1.5788559913635254, - "objective/rlhf_reward": -1.9154240846633908, - "objective/scores": 1.1, - "policy/approxkl_avg": 2.376906394958496, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.72265625, - "step": 1498, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000617027282715 - }, - { - "episode": 24000, - "epoch": 0.14379695869432363, - "loss/policy_avg": 0.20893090963363647, - "lr": 9.04205010224949e-06, - "objective/entropy": -245.84298706054688, - "objective/kl": 30.781171798706055, - "objective/non_score_reward": -1.5390586853027344, - "objective/rlhf_reward": -4.7776325727380335, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 19.28199577331543, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.705078125, - "step": 1499, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9972145557403564 - } - ], - "logging_steps": 500, - "max_steps": 7824, - "num_input_tokens_seen": 0, - "num_train_epochs": 3.0, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": true, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 0, - "train_batch_size": null, - "trial_name": null, - "trial_params": null -}