diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,27034 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "episode": 24000, + "epoch": 0.14379695869432363, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "episode": 16, + "epoch": 9.586463912954908e-05, + "loss/policy_avg": 0.015691569074988365, + "lr": 1e-05, + "objective/entropy": 136.889404296875, + "objective/kl": 13.172518730163574, + "objective/non_score_reward": -0.6586259603500366, + "objective/rlhf_reward": -1.2559016580260813, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 330.0568542480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75, + "step": 0, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999426007270813 + }, + { + "episode": 32, + "epoch": 0.00019172927825909816, + "loss/policy_avg": 0.021727558225393295, + "lr": 9.999360940695298e-06, + "objective/entropy": -4.705432891845703, + "objective/kl": 4.4086012840271, + "objective/non_score_reward": -0.22043009102344513, + "objective/rlhf_reward": 0.49688179692854306, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 25.247615814208984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4375, + "step": 1, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0005669593811035 + }, + { + "episode": 48, + "epoch": 0.00028759391738864725, + "loss/policy_avg": 0.05422616004943848, + "lr": 9.998721881390595e-06, + "objective/entropy": 26.511795043945312, + "objective/kl": 10.364278793334961, + "objective/non_score_reward": -0.5182140469551086, + "objective/rlhf_reward": -0.6222579917923059, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 174.7788543701172, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6328125, + "step": 2, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001580238342285 + }, + { + "episode": 64, + "epoch": 0.0003834585565181963, + "loss/policy_avg": 0.1031150370836258, + "lr": 9.99808282208589e-06, + "objective/entropy": -6.2874298095703125, + "objective/kl": 7.10389518737793, + "objective/non_score_reward": -0.35519474744796753, + "objective/rlhf_reward": 0.24108044284523888, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 107.51742553710938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.806640625, + "step": 3, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999995231628418 + }, + { + "episode": 80, + "epoch": 0.0004793231956477454, + "loss/policy_avg": 0.020609447732567787, + "lr": 9.997443762781187e-06, + "objective/entropy": 63.54547882080078, + "objective/kl": 1.458254337310791, + "objective/non_score_reward": -0.07291271537542343, + "objective/rlhf_reward": 1.224120924828116, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 14.240117073059082, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4150390625, + "step": 4, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000563383102417 + }, + { + "episode": 96, + "epoch": 0.0005751878347772945, + "loss/policy_avg": 0.1277482807636261, + "lr": 9.996804703476484e-06, + "objective/entropy": 55.068546295166016, + "objective/kl": 8.753851890563965, + "objective/non_score_reward": -0.43769264221191406, + "objective/rlhf_reward": -0.37216834077010735, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 100.08578491210938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.447265625, + "step": 5, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999474048614502 + }, + { + "episode": 112, + "epoch": 0.0006710524739068436, + "loss/policy_avg": 0.3148539662361145, + "lr": 9.99616564417178e-06, + "objective/entropy": 21.463600158691406, + "objective/kl": 9.847577095031738, + "objective/non_score_reward": -0.4923788607120514, + "objective/rlhf_reward": -0.02210425861352272, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 82.89840698242188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619140625, + "step": 6, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998772382736206 + }, + { + "episode": 128, + "epoch": 0.0007669171130363926, + "loss/policy_avg": -9.760260581970215e-06, + "lr": 9.995526584867077e-06, + "objective/entropy": 43.514984130859375, + "objective/kl": 6.468422889709473, + "objective/non_score_reward": -0.3234211802482605, + "objective/rlhf_reward": 0.18726797867262368, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 53.660911560058594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.595703125, + "step": 7, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0024185180664062 + }, + { + "episode": 144, + "epoch": 0.0008627817521659417, + "loss/policy_avg": 0.07420124113559723, + "lr": 9.994887525562374e-06, + "objective/entropy": 111.558837890625, + "objective/kl": 5.765064716339111, + "objective/non_score_reward": -0.2882532477378845, + "objective/rlhf_reward": 0.7943982454372089, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 38.34186935424805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4462890625, + "step": 8, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975563287734985 + }, + { + "episode": 160, + "epoch": 0.0009586463912954908, + "loss/policy_avg": 0.22252294421195984, + "lr": 9.99424846625767e-06, + "objective/entropy": 99.2086181640625, + "objective/kl": 8.770297050476074, + "objective/non_score_reward": -0.4385148584842682, + "objective/rlhf_reward": -0.35405938923358926, + "objective/scores": 0.35, + "policy/approxkl_avg": 98.07421112060547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75, + "step": 9, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961905479431152 + }, + { + "episode": 176, + "epoch": 0.0010545110304250398, + "loss/policy_avg": 0.05278925597667694, + "lr": 9.993609406952966e-06, + "objective/entropy": 192.25936889648438, + "objective/kl": 5.483057975769043, + "objective/non_score_reward": -0.27415287494659424, + "objective/rlhf_reward": 1.3033885151147842, + "objective/scores": 0.6, + "policy/approxkl_avg": 54.852699279785156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73046875, + "step": 10, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0001378059387207 + }, + { + "episode": 192, + "epoch": 0.001150375669554589, + "loss/policy_avg": 0.01604432426393032, + "lr": 9.992970347648263e-06, + "objective/entropy": 91.4354476928711, + "objective/kl": 1.6482281684875488, + "objective/non_score_reward": -0.08241140842437744, + "objective/rlhf_reward": 1.1513069728358984, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 12.662862777709961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5390625, + "step": 11, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994404315948486 + }, + { + "episode": 208, + "epoch": 0.001246240308684138, + "loss/policy_avg": 0.17367278039455414, + "lr": 9.992331288343558e-06, + "objective/entropy": 148.37680053710938, + "objective/kl": 9.977045059204102, + "objective/non_score_reward": -0.4988522529602051, + "objective/rlhf_reward": -0.4796372515880427, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 132.6361083984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4619140625, + "step": 12, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963808059692383 + }, + { + "episode": 224, + "epoch": 0.0013421049478136871, + "loss/policy_avg": -0.12138635665178299, + "lr": 9.991692229038855e-06, + "objective/entropy": -70.20156860351562, + "objective/kl": 3.8376624584198, + "objective/non_score_reward": -0.1918831169605255, + "objective/rlhf_reward": 0.6324675619602202, + "objective/scores": 0.35, + "policy/approxkl_avg": 15.127391815185547, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.505859375, + "step": 13, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.017153739929199 + }, + { + "episode": 240, + "epoch": 0.001437969586943236, + "loss/policy_avg": 0.1106414794921875, + "lr": 9.991053169734152e-06, + "objective/entropy": 129.54013061523438, + "objective/kl": 12.085613250732422, + "objective/non_score_reward": -0.6042807102203369, + "objective/rlhf_reward": -0.6837895224491755, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 178.22561645507812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5390625, + "step": 14, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999481201171875 + }, + { + "episode": 256, + "epoch": 0.0015338342260727853, + "loss/policy_avg": 0.01672934927046299, + "lr": 9.990414110429449e-06, + "objective/entropy": 177.98126220703125, + "objective/kl": 7.125063896179199, + "objective/non_score_reward": -0.3562532067298889, + "objective/rlhf_reward": -0.025012841820716947, + "objective/scores": 0.35, + "policy/approxkl_avg": 91.47238159179688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.716796875, + "step": 15, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000582218170166 + }, + { + "episode": 272, + "epoch": 0.0016296988652023342, + "loss/policy_avg": 0.14258402585983276, + "lr": 9.989775051124744e-06, + "objective/entropy": 197.2217559814453, + "objective/kl": 12.70147705078125, + "objective/non_score_reward": -0.6350738406181335, + "objective/rlhf_reward": -1.1616931343949852, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 84.26277160644531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.630859375, + "step": 16, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964921474456787 + }, + { + "episode": 288, + "epoch": 0.0017255635043318834, + "loss/policy_avg": -0.0007228106260299683, + "lr": 9.989135991820041e-06, + "objective/entropy": -9.756143569946289, + "objective/kl": 7.940765380859375, + "objective/non_score_reward": -0.3970382809638977, + "objective/rlhf_reward": -0.07238138595455501, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 42.61369323730469, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.64453125, + "step": 17, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0011234283447266 + }, + { + "episode": 304, + "epoch": 0.0018214281434614326, + "loss/policy_avg": 0.13892704248428345, + "lr": 9.988496932515338e-06, + "objective/entropy": 14.549068450927734, + "objective/kl": 9.783748626708984, + "objective/non_score_reward": -0.48918741941452026, + "objective/rlhf_reward": -0.5781475538886606, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 73.81009674072266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.607421875, + "step": 18, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998319149017334 + }, + { + "episode": 320, + "epoch": 0.0019172927825909815, + "loss/policy_avg": 0.12347989529371262, + "lr": 9.987857873210635e-06, + "objective/entropy": 197.0328369140625, + "objective/kl": 9.07555103302002, + "objective/non_score_reward": -0.453777551651001, + "objective/rlhf_reward": -0.15325071436225013, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 74.28388214111328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5625, + "step": 19, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001952648162842 + }, + { + "episode": 336, + "epoch": 0.0020131574217205307, + "loss/policy_avg": 0.06666804850101471, + "lr": 9.987218813905932e-06, + "objective/entropy": 180.56707763671875, + "objective/kl": 10.346174240112305, + "objective/non_score_reward": -0.5173087120056152, + "objective/rlhf_reward": -0.6454025848704257, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 88.01742553710938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.595703125, + "step": 20, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9958171844482422 + }, + { + "episode": 352, + "epoch": 0.0021090220608500796, + "loss/policy_avg": 0.12632718682289124, + "lr": 9.986579754601228e-06, + "objective/entropy": 165.49900817871094, + "objective/kl": 10.707776069641113, + "objective/non_score_reward": -0.5353888273239136, + "objective/rlhf_reward": -0.7629530663169442, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 118.42108917236328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.78125, + "step": 21, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9964122772216797 + }, + { + "episode": 368, + "epoch": 0.0022048866999796286, + "loss/policy_avg": 0.012576747685670853, + "lr": 9.985940695296524e-06, + "objective/entropy": -133.83059692382812, + "objective/kl": 6.06254768371582, + "objective/non_score_reward": -0.3031274080276489, + "objective/rlhf_reward": 0.21132251183215, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 4.497255325317383, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.552734375, + "step": 22, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0017709732055664 + }, + { + "episode": 384, + "epoch": 0.002300751339109178, + "loss/policy_avg": 0.21566970646381378, + "lr": 9.98530163599182e-06, + "objective/entropy": 80.05180358886719, + "objective/kl": 18.019107818603516, + "objective/non_score_reward": -0.9009554386138916, + "objective/rlhf_reward": -2.1799896850186267, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 244.3957061767578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.72265625, + "step": 23, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975435733795166 + }, + { + "episode": 400, + "epoch": 0.002396615978238727, + "loss/policy_avg": 0.21825431287288666, + "lr": 9.984662576687117e-06, + "objective/entropy": 22.858154296875, + "objective/kl": 7.889187812805176, + "objective/non_score_reward": -0.39445942640304565, + "objective/rlhf_reward": 0.5448686011871957, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 45.33286666870117, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.54296875, + "step": 24, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998900890350342 + }, + { + "episode": 416, + "epoch": 0.002492480617368276, + "loss/policy_avg": 0.2645857036113739, + "lr": 9.984023517382414e-06, + "objective/entropy": 37.619895935058594, + "objective/kl": 11.23090934753418, + "objective/non_score_reward": -0.5615454316139221, + "objective/rlhf_reward": 0.15381827354431143, + "objective/scores": 0.6, + "policy/approxkl_avg": 88.95787811279297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.552734375, + "step": 25, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996498703956604 + }, + { + "episode": 432, + "epoch": 0.002588345256497825, + "loss/policy_avg": 0.04753335565328598, + "lr": 9.983384458077711e-06, + "objective/entropy": 156.34921264648438, + "objective/kl": 7.371222496032715, + "objective/non_score_reward": -0.36856111884117126, + "objective/rlhf_reward": -0.14873159292332616, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 35.437461853027344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 26, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979305267333984 + }, + { + "episode": 448, + "epoch": 0.0026842098956273742, + "loss/policy_avg": -0.010932949371635914, + "lr": 9.982745398773006e-06, + "objective/entropy": 16.393407821655273, + "objective/kl": 16.967132568359375, + "objective/non_score_reward": -0.8483567237854004, + "objective/rlhf_reward": -2.051791122465759, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 207.71142578125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.564453125, + "step": 27, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9935011863708496 + }, + { + "episode": 464, + "epoch": 0.002780074534756923, + "loss/policy_avg": 0.23893436789512634, + "lr": 9.982106339468303e-06, + "objective/entropy": 170.59136962890625, + "objective/kl": 15.129783630371094, + "objective/non_score_reward": -0.7564891576766968, + "objective/rlhf_reward": -1.469697265830591, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 135.97763061523438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.72265625, + "step": 28, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975056648254395 + }, + { + "episode": 480, + "epoch": 0.002875939173886472, + "loss/policy_avg": 0.03272615000605583, + "lr": 9.9814672801636e-06, + "objective/entropy": 6.700323104858398, + "objective/kl": 10.701581954956055, + "objective/non_score_reward": -0.5350791215896606, + "objective/rlhf_reward": -0.6897181971982564, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 63.513145446777344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60546875, + "step": 29, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998331069946289 + }, + { + "episode": 496, + "epoch": 0.0029718038130160216, + "loss/policy_avg": 0.07188314199447632, + "lr": 9.980828220858897e-06, + "objective/entropy": -47.331199645996094, + "objective/kl": 12.874979019165039, + "objective/non_score_reward": -0.6437489986419678, + "objective/rlhf_reward": -1.1963937664903224, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 77.876220703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5390625, + "step": 30, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967491626739502 + }, + { + "episode": 512, + "epoch": 0.0030676684521455705, + "loss/policy_avg": 0.04047826677560806, + "lr": 9.980189161554194e-06, + "objective/entropy": 282.3853759765625, + "objective/kl": 9.654375076293945, + "objective/non_score_reward": -0.4827187657356262, + "objective/rlhf_reward": -0.5716251668676566, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 64.11791229248047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.89453125, + "step": 31, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997191429138184 + }, + { + "episode": 528, + "epoch": 0.0031635330912751195, + "loss/policy_avg": 0.07097287476062775, + "lr": 9.97955010224949e-06, + "objective/entropy": 116.042236328125, + "objective/kl": 14.595599174499512, + "objective/non_score_reward": -0.7297799587249756, + "objective/rlhf_reward": -0.7964137478926516, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 272.6925048828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3857421875, + "step": 32, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0027289390563965 + }, + { + "episode": 544, + "epoch": 0.0032593977304046684, + "loss/policy_avg": 0.5246497392654419, + "lr": 9.978911042944786e-06, + "objective/entropy": 8.318304061889648, + "objective/kl": 16.622827529907227, + "objective/non_score_reward": -0.831141471862793, + "objective/rlhf_reward": -1.9990529752074906, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 159.0550079345703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.703125, + "step": 33, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971305131912231 + }, + { + "episode": 560, + "epoch": 0.003355262369534218, + "loss/policy_avg": 0.20073390007019043, + "lr": 9.978271983640083e-06, + "objective/entropy": 92.97464752197266, + "objective/kl": 10.66767692565918, + "objective/non_score_reward": -0.5333837866783142, + "objective/rlhf_reward": 2.2664648383855823, + "objective/scores": 1.1, + "policy/approxkl_avg": 89.14144134521484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.544921875, + "step": 34, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000143051147461 + }, + { + "episode": 576, + "epoch": 0.0034511270086637668, + "loss/policy_avg": 0.04765152558684349, + "lr": 9.977632924335378e-06, + "objective/entropy": 149.43089294433594, + "objective/kl": 16.67333221435547, + "objective/non_score_reward": -0.8336665630340576, + "objective/rlhf_reward": -0.9346663713455201, + "objective/scores": 0.6, + "policy/approxkl_avg": 189.3590850830078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4765625, + "step": 35, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986698627471924 + }, + { + "episode": 592, + "epoch": 0.0035469916477933157, + "loss/policy_avg": 0.40008074045181274, + "lr": 9.976993865030675e-06, + "objective/entropy": 157.10501098632812, + "objective/kl": 13.927867889404297, + "objective/non_score_reward": -0.6963933706283569, + "objective/rlhf_reward": -1.406971328941685, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 121.78231811523438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.70703125, + "step": 36, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974275827407837 + }, + { + "episode": 608, + "epoch": 0.003642856286922865, + "loss/policy_avg": 0.08663024008274078, + "lr": 9.976354805725972e-06, + "objective/entropy": 47.76446533203125, + "objective/kl": 13.560833930969238, + "objective/non_score_reward": -0.6780416965484619, + "objective/rlhf_reward": -0.5894605539002753, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 43.71810531616211, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5078125, + "step": 37, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991490840911865 + }, + { + "episode": 624, + "epoch": 0.003738720926052414, + "loss/policy_avg": 0.08268876373767853, + "lr": 9.975715746421269e-06, + "objective/entropy": 192.41729736328125, + "objective/kl": 6.687016010284424, + "objective/non_score_reward": -0.3343508243560791, + "objective/rlhf_reward": 0.021846643354015427, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 67.82701873779297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.619140625, + "step": 38, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999939203262329 + }, + { + "episode": 640, + "epoch": 0.003834585565181963, + "loss/policy_avg": 0.05995899811387062, + "lr": 9.975076687116566e-06, + "objective/entropy": -98.350341796875, + "objective/kl": 9.015666961669922, + "objective/non_score_reward": -0.450783371925354, + "objective/rlhf_reward": 0.14427768908268623, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 51.733055114746094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 39, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974713325500488 + }, + { + "episode": 656, + "epoch": 0.003930450204311512, + "loss/policy_avg": 0.18854951858520508, + "lr": 9.97443762781186e-06, + "objective/entropy": 141.67947387695312, + "objective/kl": 10.309185028076172, + "objective/non_score_reward": -0.5154592990875244, + "objective/rlhf_reward": -0.6618371069431306, + "objective/scores": 0.35, + "policy/approxkl_avg": 71.02857208251953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.744140625, + "step": 40, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993364810943604 + }, + { + "episode": 672, + "epoch": 0.004026314843441061, + "loss/policy_avg": 0.05062849074602127, + "lr": 9.973798568507158e-06, + "objective/entropy": -38.6858024597168, + "objective/kl": 9.445882797241211, + "objective/non_score_reward": -0.4722941517829895, + "objective/rlhf_reward": -1.8891766667366028, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.4856438636779785, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62890625, + "step": 41, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984209537506104 + }, + { + "episode": 688, + "epoch": 0.00412217948257061, + "loss/policy_avg": 0.09501229226589203, + "lr": 9.973159509202454e-06, + "objective/entropy": 17.35771942138672, + "objective/kl": 10.873266220092773, + "objective/non_score_reward": -0.5436632633209229, + "objective/rlhf_reward": -0.44131985406080876, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 98.38662719726562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6953125, + "step": 42, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995697736740112 + }, + { + "episode": 704, + "epoch": 0.004218044121700159, + "loss/policy_avg": 0.32498252391815186, + "lr": 9.972520449897751e-06, + "objective/entropy": 174.98866271972656, + "objective/kl": 11.279447555541992, + "objective/non_score_reward": -0.5639723539352417, + "objective/rlhf_reward": -0.7749369321421384, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 62.73210144042969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.552734375, + "step": 43, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007286071777344 + }, + { + "episode": 720, + "epoch": 0.004313908760829708, + "loss/policy_avg": 0.3995896577835083, + "lr": 9.971881390593048e-06, + "objective/entropy": 36.609832763671875, + "objective/kl": 19.769756317138672, + "objective/non_score_reward": -0.9884878993034363, + "objective/rlhf_reward": -2.1291227295723667, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 164.33892822265625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.708984375, + "step": 44, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9945415258407593 + }, + { + "episode": 736, + "epoch": 0.004409773399959257, + "loss/policy_avg": 0.17710548639297485, + "lr": 9.971242331288345e-06, + "objective/entropy": 93.23808288574219, + "objective/kl": 16.88797378540039, + "objective/non_score_reward": -0.8443987965583801, + "objective/rlhf_reward": -1.7157356492882831, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 54.64923858642578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.779296875, + "step": 45, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981857538223267 + }, + { + "episode": 752, + "epoch": 0.004505638039088807, + "loss/policy_avg": 0.32767364382743835, + "lr": 9.97060327198364e-06, + "objective/entropy": 202.11843872070312, + "objective/kl": 14.050471305847168, + "objective/non_score_reward": -0.7025235295295715, + "objective/rlhf_reward": -1.484581295281572, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 76.14016723632812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 46, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997007846832275 + }, + { + "episode": 768, + "epoch": 0.004601502678218356, + "loss/policy_avg": 0.08174459636211395, + "lr": 9.969964212678937e-06, + "objective/entropy": 54.37752151489258, + "objective/kl": 15.1139497756958, + "objective/non_score_reward": -0.75569748878479, + "objective/rlhf_reward": -1.6635400888666343, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 83.4612045288086, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4296875, + "step": 47, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972434043884277 + }, + { + "episode": 784, + "epoch": 0.004697367317347905, + "loss/policy_avg": 0.03365965187549591, + "lr": 9.969325153374234e-06, + "objective/entropy": 85.39935302734375, + "objective/kl": 13.452342987060547, + "objective/non_score_reward": -0.6726170778274536, + "objective/rlhf_reward": -0.74305723138326, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 61.629390716552734, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.572265625, + "step": 48, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998470544815063 + }, + { + "episode": 800, + "epoch": 0.004793231956477454, + "loss/policy_avg": 0.009335246868431568, + "lr": 9.968686094069531e-06, + "objective/entropy": 288.22564697265625, + "objective/kl": 19.127742767333984, + "objective/non_score_reward": -0.9563871026039124, + "objective/rlhf_reward": -0.9018295153391089, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 176.43731689453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.892578125, + "step": 49, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9936624765396118 + }, + { + "episode": 816, + "epoch": 0.004889096595607003, + "loss/policy_avg": 0.13336139917373657, + "lr": 9.968047034764828e-06, + "objective/entropy": -38.686851501464844, + "objective/kl": 18.06523895263672, + "objective/non_score_reward": -0.9032620191574097, + "objective/rlhf_reward": -2.1320952503041024, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 179.73486328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65625, + "step": 50, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996016263961792 + }, + { + "episode": 832, + "epoch": 0.004984961234736552, + "loss/policy_avg": 0.09758515655994415, + "lr": 9.967407975460123e-06, + "objective/entropy": -32.55284881591797, + "objective/kl": 10.72513198852539, + "objective/non_score_reward": -0.5362565517425537, + "objective/rlhf_reward": -0.721194286544887, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 44.48727798461914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736328125, + "step": 51, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976041316986084 + }, + { + "episode": 848, + "epoch": 0.005080825873866101, + "loss/policy_avg": 0.5202991366386414, + "lr": 9.96676891615542e-06, + "objective/entropy": 45.2802734375, + "objective/kl": 16.129152297973633, + "objective/non_score_reward": -0.8064576387405396, + "objective/rlhf_reward": -1.2784193260239918, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 124.33740234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 52, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978928565979004 + }, + { + "episode": 864, + "epoch": 0.00517669051299565, + "loss/policy_avg": 0.28677505254745483, + "lr": 9.966129856850717e-06, + "objective/entropy": -76.81179809570312, + "objective/kl": 15.223251342773438, + "objective/non_score_reward": -0.761162519454956, + "objective/rlhf_reward": -1.5288782207094989, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 69.77767944335938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7890625, + "step": 53, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999229907989502 + }, + { + "episode": 880, + "epoch": 0.0052725551521251995, + "loss/policy_avg": 0.20859162509441376, + "lr": 9.965490797546014e-06, + "objective/entropy": -21.344478607177734, + "objective/kl": 10.70494556427002, + "objective/non_score_reward": -0.535247266292572, + "objective/rlhf_reward": -0.7623869264997064, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 98.75808715820312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.56640625, + "step": 54, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975996017456055 + }, + { + "episode": 896, + "epoch": 0.0053684197912547485, + "loss/policy_avg": 1.2579694986343384, + "lr": 9.96485173824131e-06, + "objective/entropy": 164.7299346923828, + "objective/kl": 18.096805572509766, + "objective/non_score_reward": -0.9048402309417725, + "objective/rlhf_reward": -2.0152409709134873, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 95.78445434570312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.736328125, + "step": 55, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966285228729248 + }, + { + "episode": 912, + "epoch": 0.0054642844303842975, + "loss/policy_avg": 0.3564913868904114, + "lr": 9.964212678936606e-06, + "objective/entropy": 85.46858215332031, + "objective/kl": 17.930484771728516, + "objective/non_score_reward": -0.89652419090271, + "objective/rlhf_reward": -1.4633905313172677, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 79.41477966308594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4091796875, + "step": 56, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984946250915527 + }, + { + "episode": 928, + "epoch": 0.005560149069513846, + "loss/policy_avg": 0.03960660099983215, + "lr": 9.963573619631903e-06, + "objective/entropy": 205.954833984375, + "objective/kl": 17.15917205810547, + "objective/non_score_reward": -0.8579585552215576, + "objective/rlhf_reward": -1.3091281972089148, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 23.591196060180664, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.791015625, + "step": 57, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997645378112793 + }, + { + "episode": 944, + "epoch": 0.005656013708643395, + "loss/policy_avg": -0.00983378104865551, + "lr": 9.9629345603272e-06, + "objective/entropy": -1.1022186279296875, + "objective/kl": 16.26142692565918, + "objective/non_score_reward": -0.8130713105201721, + "objective/rlhf_reward": 1.1477148175239567, + "objective/scores": 1.1, + "policy/approxkl_avg": 81.65092468261719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.552734375, + "step": 58, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99957275390625 + }, + { + "episode": 960, + "epoch": 0.005751878347772944, + "loss/policy_avg": 0.32060182094573975, + "lr": 9.962295501022495e-06, + "objective/entropy": 48.09014892578125, + "objective/kl": 7.438636302947998, + "objective/non_score_reward": -0.3719318211078644, + "objective/rlhf_reward": 0.6349789739391469, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.77626895904541, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.822265625, + "step": 59, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.008031129837036 + }, + { + "episode": 976, + "epoch": 0.005847742986902493, + "loss/policy_avg": 0.2516993582248688, + "lr": 9.961656441717792e-06, + "objective/entropy": -46.64883804321289, + "objective/kl": 19.601835250854492, + "objective/non_score_reward": -0.9800918102264404, + "objective/rlhf_reward": -2.594854134946985, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 181.5974578857422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.609375, + "step": 60, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988071918487549 + }, + { + "episode": 992, + "epoch": 0.005943607626032043, + "loss/policy_avg": 0.1109720841050148, + "lr": 9.961017382413088e-06, + "objective/entropy": 97.6422348022461, + "objective/kl": 13.844486236572266, + "objective/non_score_reward": -0.692224383354187, + "objective/rlhf_reward": -1.2126380791335847, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 96.34603118896484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 61, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974918365478516 + }, + { + "episode": 1008, + "epoch": 0.006039472265161592, + "loss/policy_avg": -0.05115126073360443, + "lr": 9.960378323108385e-06, + "objective/entropy": 34.42061996459961, + "objective/kl": 14.079090118408203, + "objective/non_score_reward": -0.7039545774459839, + "objective/rlhf_reward": -1.4565682944997977, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 49.87873840332031, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.677734375, + "step": 62, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982357025146484 + }, + { + "episode": 1024, + "epoch": 0.006135336904291141, + "loss/policy_avg": 0.22280101478099823, + "lr": 9.959739263803682e-06, + "objective/entropy": -24.89067840576172, + "objective/kl": 19.501176834106445, + "objective/non_score_reward": -0.9750589728355408, + "objective/rlhf_reward": -2.4496376319841, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 243.47512817382812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.888671875, + "step": 63, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999916911125183 + }, + { + "episode": 1040, + "epoch": 0.00623120154342069, + "loss/policy_avg": 0.36840492486953735, + "lr": 9.959100204498979e-06, + "objective/entropy": 134.6929931640625, + "objective/kl": 22.332670211791992, + "objective/non_score_reward": -1.1166335344314575, + "objective/rlhf_reward": -2.641705389293741, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 136.65045166015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 64, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981276988983154 + }, + { + "episode": 1056, + "epoch": 0.006327066182550239, + "loss/policy_avg": 0.09098342061042786, + "lr": 9.958461145194274e-06, + "objective/entropy": -26.864063262939453, + "objective/kl": 13.052759170532227, + "objective/non_score_reward": -0.6526379585266113, + "objective/rlhf_reward": -0.7857228770580997, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 62.885929107666016, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.603515625, + "step": 65, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997183084487915 + }, + { + "episode": 1072, + "epoch": 0.006422930821679788, + "loss/policy_avg": 0.27086368203163147, + "lr": 9.957822085889571e-06, + "objective/entropy": -58.01667404174805, + "objective/kl": 16.48623275756836, + "objective/non_score_reward": -0.8243115544319153, + "objective/rlhf_reward": -1.635386770189391, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 153.92050170898438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 66, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005505084991455 + }, + { + "episode": 1088, + "epoch": 0.006518795460809337, + "loss/policy_avg": 1.2388324737548828, + "lr": 9.957183026584868e-06, + "objective/entropy": 99.91399383544922, + "objective/kl": 21.524110794067383, + "objective/non_score_reward": -1.0762056112289429, + "objective/rlhf_reward": -2.6429626993542774, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 170.69760131835938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.64453125, + "step": 67, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9946773052215576 + }, + { + "episode": 1104, + "epoch": 0.006614660099938887, + "loss/policy_avg": 0.330521821975708, + "lr": 9.956543967280165e-06, + "objective/entropy": -76.99481201171875, + "objective/kl": 15.58948802947998, + "objective/non_score_reward": -0.7794743776321411, + "objective/rlhf_reward": -1.7178976856172086, + "objective/scores": 0.35, + "policy/approxkl_avg": 218.45574951171875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.64453125, + "step": 68, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997715950012207 + }, + { + "episode": 1120, + "epoch": 0.006710524739068436, + "loss/policy_avg": 0.11920663714408875, + "lr": 9.955904907975462e-06, + "objective/entropy": 70.55160522460938, + "objective/kl": 20.134777069091797, + "objective/non_score_reward": -1.0067389011383057, + "objective/rlhf_reward": -2.6853197722727353, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 62.195674896240234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.34765625, + "step": 69, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001072406768799 + }, + { + "episode": 1136, + "epoch": 0.006806389378197985, + "loss/policy_avg": -0.17695794999599457, + "lr": 9.955265848670757e-06, + "objective/entropy": 101.99272918701172, + "objective/kl": 12.69788932800293, + "objective/non_score_reward": -0.6348943710327148, + "objective/rlhf_reward": -2.539577692747116, + "objective/scores": 0.0, + "policy/approxkl_avg": 64.835693359375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.44140625, + "step": 70, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0115315914154053 + }, + { + "episode": 1152, + "epoch": 0.0069022540173275335, + "loss/policy_avg": 0.35137245059013367, + "lr": 9.954626789366054e-06, + "objective/entropy": 79.80499267578125, + "objective/kl": 21.120101928710938, + "objective/non_score_reward": -1.0560050010681152, + "objective/rlhf_reward": -2.1013141296067577, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 124.16864776611328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68359375, + "step": 71, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998917818069458 + }, + { + "episode": 1168, + "epoch": 0.0069981186564570825, + "loss/policy_avg": 0.07422849535942078, + "lr": 9.95398773006135e-06, + "objective/entropy": 9.376724243164062, + "objective/kl": 15.093628883361816, + "objective/non_score_reward": -0.7546814680099487, + "objective/rlhf_reward": -1.6594760653719138, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 47.567962646484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 72, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9958832263946533 + }, + { + "episode": 1184, + "epoch": 0.0070939832955866314, + "loss/policy_avg": 0.11969298124313354, + "lr": 9.953348670756648e-06, + "objective/entropy": 133.57423400878906, + "objective/kl": 20.2343807220459, + "objective/non_score_reward": -1.0117191076278687, + "objective/rlhf_reward": -1.1231571778070655, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 93.79672241210938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.423828125, + "step": 73, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0005669593811035 + }, + { + "episode": 1200, + "epoch": 0.00718984793471618, + "loss/policy_avg": 0.2395152747631073, + "lr": 9.952709611451944e-06, + "objective/entropy": 31.68697166442871, + "objective/kl": 20.96116828918457, + "objective/non_score_reward": -1.0480585098266602, + "objective/rlhf_reward": -2.711281481202006, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 194.83474731445312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.669921875, + "step": 74, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9953577518463135 + }, + { + "episode": 1216, + "epoch": 0.00728571257384573, + "loss/policy_avg": 0.27856501936912537, + "lr": 9.952070552147241e-06, + "objective/entropy": 119.42091369628906, + "objective/kl": 11.30095100402832, + "objective/non_score_reward": -0.5650476217269897, + "objective/rlhf_reward": -0.9185547738367612, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 59.14590835571289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.75, + "step": 75, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9953703880310059 + }, + { + "episode": 1232, + "epoch": 0.007381577212975279, + "loss/policy_avg": 0.21030786633491516, + "lr": 9.951431492842536e-06, + "objective/entropy": 7.310768127441406, + "objective/kl": 6.645857810974121, + "objective/non_score_reward": -0.3322928845882416, + "objective/rlhf_reward": 0.04943063011993787, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 14.611559867858887, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.591796875, + "step": 76, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996391534805298 + }, + { + "episode": 1248, + "epoch": 0.007477441852104828, + "loss/policy_avg": 0.4117072820663452, + "lr": 9.950792433537833e-06, + "objective/entropy": -109.53082275390625, + "objective/kl": 11.825650215148926, + "objective/non_score_reward": -0.5912825465202332, + "objective/rlhf_reward": 0.03486987352371207, + "objective/scores": 0.6, + "policy/approxkl_avg": 19.0810604095459, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6171875, + "step": 77, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981213808059692 + }, + { + "episode": 1264, + "epoch": 0.007573306491234377, + "loss/policy_avg": 0.2597622275352478, + "lr": 9.950153374233129e-06, + "objective/entropy": -29.7529296875, + "objective/kl": 18.43012809753418, + "objective/non_score_reward": -0.9215063452720642, + "objective/rlhf_reward": -2.2860254704952236, + "objective/scores": 0.35, + "policy/approxkl_avg": 267.2847900390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.70703125, + "step": 78, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997544765472412 + }, + { + "episode": 1280, + "epoch": 0.007669171130363926, + "loss/policy_avg": 0.2407466471195221, + "lr": 9.949514314928425e-06, + "objective/entropy": 14.07373046875, + "objective/kl": 20.781753540039062, + "objective/non_score_reward": -1.0390876531600952, + "objective/rlhf_reward": -1.2326316579591956, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 147.4822235107422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.724609375, + "step": 79, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987099170684814 + }, + { + "episode": 1296, + "epoch": 0.007765035769493475, + "loss/policy_avg": 0.17344285547733307, + "lr": 9.948875255623722e-06, + "objective/entropy": 112.44259643554688, + "objective/kl": 10.0985746383667, + "objective/non_score_reward": -0.504928708076477, + "objective/rlhf_reward": 0.38028510808944693, + "objective/scores": 0.6, + "policy/approxkl_avg": 4.8866167068481445, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.443359375, + "step": 80, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0143842697143555 + }, + { + "episode": 1312, + "epoch": 0.007860900408623025, + "loss/policy_avg": 0.14816004037857056, + "lr": 9.94823619631902e-06, + "objective/entropy": 67.11033630371094, + "objective/kl": 17.487518310546875, + "objective/non_score_reward": -0.8743758797645569, + "objective/rlhf_reward": -2.1558679251963193, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 18.69343376159668, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4619140625, + "step": 81, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998113751411438 + }, + { + "episode": 1328, + "epoch": 0.007956765047752574, + "loss/policy_avg": 0.2536642849445343, + "lr": 9.947597137014316e-06, + "objective/entropy": -71.85224914550781, + "objective/kl": 11.223343849182129, + "objective/non_score_reward": -0.5611672401428223, + "objective/rlhf_reward": -0.7637163875654935, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 37.78028869628906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48828125, + "step": 82, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003702640533447 + }, + { + "episode": 1344, + "epoch": 0.008052629686882123, + "loss/policy_avg": 0.3479039669036865, + "lr": 9.946958077709611e-06, + "objective/entropy": 146.41241455078125, + "objective/kl": 20.458145141601562, + "objective/non_score_reward": -1.0229072570800781, + "objective/rlhf_reward": -2.732379042838497, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 64.28889465332031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.705078125, + "step": 83, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976742267608643 + }, + { + "episode": 1360, + "epoch": 0.008148494326011672, + "loss/policy_avg": 0.10525624454021454, + "lr": 9.946319018404908e-06, + "objective/entropy": -43.42662048339844, + "objective/kl": 13.858359336853027, + "objective/non_score_reward": -0.6929180026054382, + "objective/rlhf_reward": -0.6489658228316642, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 61.37925720214844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48828125, + "step": 84, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012595653533936 + }, + { + "episode": 1376, + "epoch": 0.00824435896514122, + "loss/policy_avg": 0.3409525156021118, + "lr": 9.945679959100205e-06, + "objective/entropy": 1.5508041381835938, + "objective/kl": 19.05010223388672, + "objective/non_score_reward": -0.9525051116943359, + "objective/rlhf_reward": -2.205900583330708, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 97.6533203125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 85, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000422477722168 + }, + { + "episode": 1392, + "epoch": 0.00834022360427077, + "loss/policy_avg": 0.3110717535018921, + "lr": 9.945040899795502e-06, + "objective/entropy": 215.75965881347656, + "objective/kl": 18.800819396972656, + "objective/non_score_reward": -0.9400409460067749, + "objective/rlhf_reward": -2.156043860975819, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 84.93620300292969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.669921875, + "step": 86, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962902069091797 + }, + { + "episode": 1408, + "epoch": 0.008436088243400319, + "loss/policy_avg": 0.02868543565273285, + "lr": 9.944401840490799e-06, + "objective/entropy": 154.10025024414062, + "objective/kl": 13.492873191833496, + "objective/non_score_reward": -0.6746436357498169, + "objective/rlhf_reward": -0.9652413214246431, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 42.483882904052734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44921875, + "step": 87, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983662366867065 + }, + { + "episode": 1424, + "epoch": 0.008531952882529868, + "loss/policy_avg": 0.07607420533895493, + "lr": 9.943762781186096e-06, + "objective/entropy": 202.40365600585938, + "objective/kl": 13.719297409057617, + "objective/non_score_reward": -0.685964822769165, + "objective/rlhf_reward": 1.6561407089233402, + "objective/scores": 1.1, + "policy/approxkl_avg": 20.57819175720215, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.728515625, + "step": 88, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999366283416748 + }, + { + "episode": 1440, + "epoch": 0.008627817521659416, + "loss/policy_avg": 0.16665664315223694, + "lr": 9.94312372188139e-06, + "objective/entropy": -100.20193481445312, + "objective/kl": 15.216776847839355, + "objective/non_score_reward": -0.7608388662338257, + "objective/rlhf_reward": -1.4392355120817002, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 85.36731719970703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 89, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990196228027344 + }, + { + "episode": 1456, + "epoch": 0.008723682160788965, + "loss/policy_avg": 0.19817781448364258, + "lr": 9.942484662576688e-06, + "objective/entropy": -0.7409725189208984, + "objective/kl": 10.389724731445312, + "objective/non_score_reward": -0.5194862484931946, + "objective/rlhf_reward": 2.3220549762248996, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.642692565917969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.52734375, + "step": 90, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989277124404907 + }, + { + "episode": 1472, + "epoch": 0.008819546799918514, + "loss/policy_avg": 0.2365586757659912, + "lr": 9.941845603271985e-06, + "objective/entropy": 152.64306640625, + "objective/kl": 21.58309555053711, + "objective/non_score_reward": -1.0791547298431396, + "objective/rlhf_reward": -2.9573691723093223, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 87.72661590576172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.771484375, + "step": 91, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999784231185913 + }, + { + "episode": 1488, + "epoch": 0.008915411439048063, + "loss/policy_avg": 0.059907689690589905, + "lr": 9.941206543967281e-06, + "objective/entropy": 89.6580810546875, + "objective/kl": 16.996726989746094, + "objective/non_score_reward": -0.8498364686965942, + "objective/rlhf_reward": -1.9755135669308581, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 72.40145874023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.861328125, + "step": 92, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003039836883545 + }, + { + "episode": 1504, + "epoch": 0.009011276078177614, + "loss/policy_avg": 0.14265713095664978, + "lr": 9.940567484662578e-06, + "objective/entropy": -33.708492279052734, + "objective/kl": 15.94516372680664, + "objective/non_score_reward": -0.797258198261261, + "objective/rlhf_reward": -0.2653137638580527, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 78.95989990234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 93, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997810959815979 + }, + { + "episode": 1520, + "epoch": 0.009107140717307163, + "loss/policy_avg": -0.018713245168328285, + "lr": 9.939928425357874e-06, + "objective/entropy": -3.091245651245117, + "objective/kl": 14.482427597045898, + "objective/non_score_reward": -0.7241213917732239, + "objective/rlhf_reward": -1.2346261046534641, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 56.76847839355469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.501953125, + "step": 94, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993091821670532 + }, + { + "episode": 1536, + "epoch": 0.009203005356436712, + "loss/policy_avg": -0.0069353943690657616, + "lr": 9.93928936605317e-06, + "objective/entropy": 95.46006774902344, + "objective/kl": 20.928672790527344, + "objective/non_score_reward": -1.046433687210083, + "objective/rlhf_reward": -2.360906060012888, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 103.58160400390625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 95, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974098205566406 + }, + { + "episode": 1552, + "epoch": 0.009298869995566261, + "loss/policy_avg": 0.0523187518119812, + "lr": 9.938650306748467e-06, + "objective/entropy": 16.342994689941406, + "objective/kl": 20.205509185791016, + "objective/non_score_reward": -1.0102753639221191, + "objective/rlhf_reward": 0.35889836549758947, + "objective/scores": 1.1, + "policy/approxkl_avg": 84.55277252197266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4697265625, + "step": 96, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000453233718872 + }, + { + "episode": 1568, + "epoch": 0.00939473463469581, + "loss/policy_avg": 0.18428044021129608, + "lr": 9.938011247443764e-06, + "objective/entropy": -31.386062622070312, + "objective/kl": 19.641075134277344, + "objective/non_score_reward": -0.9820537567138672, + "objective/rlhf_reward": -1.8055088541665412, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 92.56884002685547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59765625, + "step": 97, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0001230239868164 + }, + { + "episode": 1584, + "epoch": 0.009490599273825359, + "loss/policy_avg": -0.11768925935029984, + "lr": 9.937372188139061e-06, + "objective/entropy": -29.0854434967041, + "objective/kl": 16.647226333618164, + "objective/non_score_reward": -0.8323614001274109, + "objective/rlhf_reward": -1.9701957342371177, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.0866272449493408, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.541015625, + "step": 98, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0018460750579834 + }, + { + "episode": 1600, + "epoch": 0.009586463912954908, + "loss/policy_avg": 0.06727765500545502, + "lr": 9.936733128834358e-06, + "objective/entropy": 96.53413391113281, + "objective/kl": 21.015684127807617, + "objective/non_score_reward": -1.0507843494415283, + "objective/rlhf_reward": -2.8031371593475343, + "objective/scores": 0.35, + "policy/approxkl_avg": 36.56340026855469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 99, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9949061870574951 + }, + { + "episode": 1616, + "epoch": 0.009682328552084457, + "loss/policy_avg": 0.28386813402175903, + "lr": 9.936094069529653e-06, + "objective/entropy": 33.901954650878906, + "objective/kl": 19.533782958984375, + "objective/non_score_reward": -0.9766892194747925, + "objective/rlhf_reward": -2.425804230387568, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 162.0339813232422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985501766204834 + }, + { + "episode": 1632, + "epoch": 0.009778193191214006, + "loss/policy_avg": 0.11220409721136093, + "lr": 9.93545501022495e-06, + "objective/entropy": -3.93096923828125, + "objective/kl": 22.981700897216797, + "objective/non_score_reward": -1.1490850448608398, + "objective/rlhf_reward": -3.1725080504017744, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 46.0514030456543, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6328125, + "step": 101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0070252418518066 + }, + { + "episode": 1648, + "epoch": 0.009874057830343555, + "loss/policy_avg": 0.20420242846012115, + "lr": 9.934815950920245e-06, + "objective/entropy": 198.98751831054688, + "objective/kl": 17.92270278930664, + "objective/non_score_reward": -0.8961352109909058, + "objective/rlhf_reward": -1.759712155136179, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 55.74137878417969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980387687683105 + }, + { + "episode": 1664, + "epoch": 0.009969922469473104, + "loss/policy_avg": 0.27041423320770264, + "lr": 9.934176891615542e-06, + "objective/entropy": 1.5637626647949219, + "objective/kl": 12.633028030395508, + "objective/non_score_reward": -0.6316514015197754, + "objective/rlhf_reward": -0.7017769768563022, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 13.92137622833252, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4208984375, + "step": 103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987752437591553 + }, + { + "episode": 1680, + "epoch": 0.010065787108602653, + "loss/policy_avg": 0.318324476480484, + "lr": 9.933537832310839e-06, + "objective/entropy": 218.76858520507812, + "objective/kl": 21.40100860595703, + "objective/non_score_reward": -1.0700504779815674, + "objective/rlhf_reward": -2.9385662584597165, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 90.99249267578125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.677734375, + "step": 104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998801827430725 + }, + { + "episode": 1696, + "epoch": 0.010161651747732202, + "loss/policy_avg": 0.3075984716415405, + "lr": 9.932898773006136e-06, + "objective/entropy": -56.81090545654297, + "objective/kl": 10.457717895507812, + "objective/non_score_reward": -0.5228859186172485, + "objective/rlhf_reward": -0.7129414687431871, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 48.63943862915039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.995776653289795 + }, + { + "episode": 1712, + "epoch": 0.01025751638686175, + "loss/policy_avg": 0.5551585555076599, + "lr": 9.932259713701433e-06, + "objective/entropy": -48.12900924682617, + "objective/kl": 21.915470123291016, + "objective/non_score_reward": -1.0957735776901245, + "objective/rlhf_reward": -1.459375207067701, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 33.369083404541016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71484375, + "step": 106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995157241821289 + }, + { + "episode": 1728, + "epoch": 0.0103533810259913, + "loss/policy_avg": 0.252463161945343, + "lr": 9.931620654396728e-06, + "objective/entropy": -69.64755249023438, + "objective/kl": 15.248108863830566, + "objective/non_score_reward": -0.7624054551124573, + "objective/rlhf_reward": -1.707986166983276, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 59.05755615234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7578125, + "step": 107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9963257312774658 + }, + { + "episode": 1744, + "epoch": 0.01044924566512085, + "loss/policy_avg": 0.13919854164123535, + "lr": 9.930981595092025e-06, + "objective/entropy": -133.55258178710938, + "objective/kl": 17.2213134765625, + "objective/non_score_reward": -0.8610656261444092, + "objective/rlhf_reward": -2.0850126979097556, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 32.41887664794922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5234375, + "step": 108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992578029632568 + }, + { + "episode": 1760, + "epoch": 0.010545110304250399, + "loss/policy_avg": 0.5300755500793457, + "lr": 9.930342535787322e-06, + "objective/entropy": -9.471179962158203, + "objective/kl": 18.607471466064453, + "objective/non_score_reward": -0.9303736090660095, + "objective/rlhf_reward": -2.3214945554733273, + "objective/scores": 0.35, + "policy/approxkl_avg": 31.75185203552246, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.654296875, + "step": 109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994070529937744 + }, + { + "episode": 1776, + "epoch": 0.010640974943379948, + "loss/policy_avg": 0.17107412219047546, + "lr": 9.929703476482619e-06, + "objective/entropy": 72.44110107421875, + "objective/kl": 16.862125396728516, + "objective/non_score_reward": -0.8431062698364258, + "objective/rlhf_reward": -3.372425138950348, + "objective/scores": 0.0, + "policy/approxkl_avg": 66.22834777832031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.70703125, + "step": 110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995293378829956 + }, + { + "episode": 1792, + "epoch": 0.010736839582509497, + "loss/policy_avg": -0.11443672329187393, + "lr": 9.929064417177915e-06, + "objective/entropy": 80.82670593261719, + "objective/kl": 18.79993438720703, + "objective/non_score_reward": -0.9399967789649963, + "objective/rlhf_reward": -2.336154927213756, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 31.270248413085938, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5625, + "step": 111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.007622241973877 + }, + { + "episode": 1808, + "epoch": 0.010832704221639046, + "loss/policy_avg": 0.0878123939037323, + "lr": 9.928425357873212e-06, + "objective/entropy": -118.92440795898438, + "objective/kl": 17.83495330810547, + "objective/non_score_reward": -0.8917477130889893, + "objective/rlhf_reward": -2.2253551392847593, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 20.88257598876953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996976613998413 + }, + { + "episode": 1824, + "epoch": 0.010928568860768595, + "loss/policy_avg": 0.18364591896533966, + "lr": 9.927786298568507e-06, + "objective/entropy": 8.144821166992188, + "objective/kl": 14.821235656738281, + "objective/non_score_reward": -0.741061806678772, + "objective/rlhf_reward": -1.2309138337771097, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 17.778968811035156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000706911087036 + }, + { + "episode": 1840, + "epoch": 0.011024433499898144, + "loss/policy_avg": 0.06979192793369293, + "lr": 9.927147239263804e-06, + "objective/entropy": -2.9724502563476562, + "objective/kl": 17.076000213623047, + "objective/non_score_reward": -0.8538000583648682, + "objective/rlhf_reward": -1.8994284508549533, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 46.98078918457031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.798828125, + "step": 114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999396800994873 + }, + { + "episode": 1856, + "epoch": 0.011120298139027693, + "loss/policy_avg": 0.27465301752090454, + "lr": 9.926508179959101e-06, + "objective/entropy": 40.056610107421875, + "objective/kl": 22.515907287597656, + "objective/non_score_reward": -1.1257953643798828, + "objective/rlhf_reward": -2.8413221291905506, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 81.93817138671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0007479190826416 + }, + { + "episode": 1872, + "epoch": 0.011216162778157242, + "loss/policy_avg": 0.3945024013519287, + "lr": 9.925869120654398e-06, + "objective/entropy": 69.15873718261719, + "objective/kl": 21.74050521850586, + "objective/non_score_reward": -1.0870254039764404, + "objective/rlhf_reward": -3.0225888824760148, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 38.46895980834961, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59765625, + "step": 116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0014419555664062 + }, + { + "episode": 1888, + "epoch": 0.01131202741728679, + "loss/policy_avg": 0.5689772367477417, + "lr": 9.925230061349695e-06, + "objective/entropy": 144.26678466796875, + "objective/kl": 14.530990600585938, + "objective/non_score_reward": -0.726549506187439, + "objective/rlhf_reward": -1.1728648702303568, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.715579628944397, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8203125, + "step": 117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0058815479278564 + }, + { + "episode": 1904, + "epoch": 0.01140789205641634, + "loss/policy_avg": -0.025625256821513176, + "lr": 9.92459100204499e-06, + "objective/entropy": -91.6683120727539, + "objective/kl": 16.61312484741211, + "objective/non_score_reward": -0.8306561708450317, + "objective/rlhf_reward": -1.944022663918835, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 18.064186096191406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4990234375, + "step": 118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999868392944336 + }, + { + "episode": 1920, + "epoch": 0.011503756695545889, + "loss/policy_avg": 0.4135175943374634, + "lr": 9.923951942740287e-06, + "objective/entropy": 145.33905029296875, + "objective/kl": 18.559207916259766, + "objective/non_score_reward": -0.9279603958129883, + "objective/rlhf_reward": -1.5891353509583808, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 19.033662796020508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981472492218018 + }, + { + "episode": 1936, + "epoch": 0.011599621334675438, + "loss/policy_avg": 0.3322446942329407, + "lr": 9.923312883435584e-06, + "objective/entropy": 109.6761474609375, + "objective/kl": 18.231651306152344, + "objective/non_score_reward": -0.9115825891494751, + "objective/rlhf_reward": -1.2463304907083512, + "objective/scores": 0.6, + "policy/approxkl_avg": 108.51126098632812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65625, + "step": 120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996593952178955 + }, + { + "episode": 1952, + "epoch": 0.011695485973804987, + "loss/policy_avg": 0.22522342205047607, + "lr": 9.92267382413088e-06, + "objective/entropy": 95.46246337890625, + "objective/kl": 16.838998794555664, + "objective/non_score_reward": -0.841949999332428, + "objective/rlhf_reward": -1.8520282743298375, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 14.038084983825684, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8046875, + "step": 121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997774362564087 + }, + { + "episode": 1968, + "epoch": 0.011791350612934537, + "loss/policy_avg": 0.18379229307174683, + "lr": 9.922034764826178e-06, + "objective/entropy": 138.12388610839844, + "objective/kl": 25.93743324279785, + "objective/non_score_reward": -1.2968716621398926, + "objective/rlhf_reward": -3.828236812089367, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 26.206398010253906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.734375, + "step": 122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0011606216430664 + }, + { + "episode": 1984, + "epoch": 0.011887215252064086, + "loss/policy_avg": 0.31653979420661926, + "lr": 9.921395705521473e-06, + "objective/entropy": -44.61676788330078, + "objective/kl": 21.166324615478516, + "objective/non_score_reward": -1.0583162307739258, + "objective/rlhf_reward": -2.9077520704566666, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 29.74887466430664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.521484375, + "step": 123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996273517608643 + }, + { + "episode": 2000, + "epoch": 0.011983079891193635, + "loss/policy_avg": 0.1589316874742508, + "lr": 9.92075664621677e-06, + "objective/entropy": -77.4912109375, + "objective/kl": 20.79126739501953, + "objective/non_score_reward": -1.0395634174346924, + "objective/rlhf_reward": -2.4249199191729227, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 133.58343505859375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66015625, + "step": 124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9961724281311035 + }, + { + "episode": 2016, + "epoch": 0.012078944530323184, + "loss/policy_avg": 0.2586688995361328, + "lr": 9.920117586912067e-06, + "objective/entropy": 139.38818359375, + "objective/kl": 21.455245971679688, + "objective/non_score_reward": -1.072762370109558, + "objective/rlhf_reward": -2.775277876647648, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 47.609947204589844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8125, + "step": 125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975237846374512 + }, + { + "episode": 2032, + "epoch": 0.012174809169452733, + "loss/policy_avg": 0.16066747903823853, + "lr": 9.919478527607362e-06, + "objective/entropy": 72.43231201171875, + "objective/kl": 20.59688377380371, + "objective/non_score_reward": -1.0298442840576172, + "objective/rlhf_reward": 0.28062304258346593, + "objective/scores": 1.1, + "policy/approxkl_avg": 75.74966430664062, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.529296875, + "step": 126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998389720916748 + }, + { + "episode": 2048, + "epoch": 0.012270673808582282, + "loss/policy_avg": 0.07932023704051971, + "lr": 9.918839468302659e-06, + "objective/entropy": -12.7745361328125, + "objective/kl": 20.53061294555664, + "objective/non_score_reward": -1.0265307426452637, + "objective/rlhf_reward": -2.7275206232942164, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 19.110069274902344, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.55859375, + "step": 127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984248876571655 + }, + { + "episode": 2064, + "epoch": 0.012366538447711831, + "loss/policy_avg": 0.27331969141960144, + "lr": 9.918200408997956e-06, + "objective/entropy": 101.82013702392578, + "objective/kl": 18.18286895751953, + "objective/non_score_reward": -0.9091434478759766, + "objective/rlhf_reward": -2.2579716230310023, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 6.703115463256836, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.556640625, + "step": 128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009617805480957 + }, + { + "episode": 2080, + "epoch": 0.01246240308684138, + "loss/policy_avg": 0.4916057586669922, + "lr": 9.917561349693252e-06, + "objective/entropy": 88.1321029663086, + "objective/kl": 23.30657958984375, + "objective/non_score_reward": -1.165329098701477, + "objective/rlhf_reward": -3.3020663795217704, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 142.93795776367188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967398643493652 + }, + { + "episode": 2096, + "epoch": 0.012558267725970929, + "loss/policy_avg": 0.16071423888206482, + "lr": 9.91692229038855e-06, + "objective/entropy": 136.1899871826172, + "objective/kl": 15.380975723266602, + "objective/non_score_reward": -0.769048810005188, + "objective/rlhf_reward": -0.6761951804161073, + "objective/scores": 0.6, + "policy/approxkl_avg": 28.551767349243164, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.56640625, + "step": 130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.03311824798584 + }, + { + "episode": 2112, + "epoch": 0.012654132365100478, + "loss/policy_avg": 0.0021135974675416946, + "lr": 9.916283231083844e-06, + "objective/entropy": -71.15084838867188, + "objective/kl": 18.961715698242188, + "objective/non_score_reward": -0.9480857849121094, + "objective/rlhf_reward": -2.1304838709241016, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.844127893447876, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4833984375, + "step": 131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009913444519043 + }, + { + "episode": 2128, + "epoch": 0.012749997004230027, + "loss/policy_avg": 0.042635850608348846, + "lr": 9.915644171779141e-06, + "objective/entropy": 20.673603057861328, + "objective/kl": 15.986173629760742, + "objective/non_score_reward": -0.7993086576461792, + "objective/rlhf_reward": -1.8555989473158414, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 36.049034118652344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.67578125, + "step": 132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998380422592163 + }, + { + "episode": 2144, + "epoch": 0.012845861643359576, + "loss/policy_avg": 0.46513473987579346, + "lr": 9.915005112474438e-06, + "objective/entropy": 5.5274505615234375, + "objective/kl": 19.590290069580078, + "objective/non_score_reward": -0.979514479637146, + "objective/rlhf_reward": -2.5394558692849696, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 12.074180603027344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.642578125, + "step": 133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011277198791504 + }, + { + "episode": 2160, + "epoch": 0.012941726282489125, + "loss/policy_avg": 0.245748370885849, + "lr": 9.914366053169735e-06, + "objective/entropy": 65.60797119140625, + "objective/kl": 19.637710571289062, + "objective/non_score_reward": -0.9818854928016663, + "objective/rlhf_reward": -1.980130786971982, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 50.17578125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.791015625, + "step": 134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983665943145752 + }, + { + "episode": 2176, + "epoch": 0.013037590921618674, + "loss/policy_avg": 0.02180427499115467, + "lr": 9.913726993865032e-06, + "objective/entropy": 0.8936500549316406, + "objective/kl": 24.33076286315918, + "objective/non_score_reward": -1.2165381908416748, + "objective/rlhf_reward": -3.524517109900146, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 69.30375671386719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5009765625, + "step": 135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99745774269104 + }, + { + "episode": 2192, + "epoch": 0.013133455560748224, + "loss/policy_avg": 0.36717042326927185, + "lr": 9.913087934560329e-06, + "objective/entropy": 83.415283203125, + "objective/kl": 21.930896759033203, + "objective/non_score_reward": -1.0965447425842285, + "objective/rlhf_reward": -1.4624603136789527, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 79.15277862548828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.546875, + "step": 136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998199701309204 + }, + { + "episode": 2208, + "epoch": 0.013229320199877773, + "loss/policy_avg": 0.2460360825061798, + "lr": 9.912448875255624e-06, + "objective/entropy": 137.11976623535156, + "objective/kl": 21.218502044677734, + "objective/non_score_reward": -1.060925006866455, + "objective/rlhf_reward": -2.8198681666451373, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 67.851806640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.666015625, + "step": 137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9969704151153564 + }, + { + "episode": 2224, + "epoch": 0.013325184839007322, + "loss/policy_avg": 0.21244561672210693, + "lr": 9.911809815950921e-06, + "objective/entropy": 175.0180206298828, + "objective/kl": 16.889467239379883, + "objective/non_score_reward": -0.8444733619689941, + "objective/rlhf_reward": -1.4304821593331654, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 78.4537353515625, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.515625, + "step": 138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985227584838867 + }, + { + "episode": 2240, + "epoch": 0.013421049478136871, + "loss/policy_avg": 0.18417471647262573, + "lr": 9.911170756646218e-06, + "objective/entropy": 224.734619140625, + "objective/kl": 33.112342834472656, + "objective/non_score_reward": -1.6556169986724854, + "objective/rlhf_reward": -4.889135018984477, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 160.8165283203125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7109375, + "step": 139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992772340774536 + }, + { + "episode": 2256, + "epoch": 0.01351691411726642, + "loss/policy_avg": 0.40639203786849976, + "lr": 9.910531697341515e-06, + "objective/entropy": 69.94343566894531, + "objective/kl": 24.266616821289062, + "objective/non_score_reward": -1.2133309841156006, + "objective/rlhf_reward": -3.40272543868576, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 126.5036392211914, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5625, + "step": 140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999225378036499 + }, + { + "episode": 2272, + "epoch": 0.01361277875639597, + "loss/policy_avg": 0.28501349687576294, + "lr": 9.909892638036812e-06, + "objective/entropy": 61.523101806640625, + "objective/kl": 17.776689529418945, + "objective/non_score_reward": -0.8888344764709473, + "objective/rlhf_reward": -1.8220045725504557, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 87.0567398071289, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.537109375, + "step": 141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000370740890503 + }, + { + "episode": 2288, + "epoch": 0.013708643395525518, + "loss/policy_avg": 0.30668091773986816, + "lr": 9.909253578732107e-06, + "objective/entropy": 227.46041870117188, + "objective/kl": 20.17832374572754, + "objective/non_score_reward": -1.0089161396026611, + "objective/rlhf_reward": -2.5198930142247047, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 50.498268127441406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.685546875, + "step": 142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999216079711914 + }, + { + "episode": 2304, + "epoch": 0.013804508034655067, + "loss/policy_avg": 0.3348355293273926, + "lr": 9.908614519427404e-06, + "objective/entropy": 164.50863647460938, + "objective/kl": 13.646249771118164, + "objective/non_score_reward": -0.6823124885559082, + "objective/rlhf_reward": -1.1251298821607407, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 63.31299591064453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.95703125, + "step": 143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986834526062012 + }, + { + "episode": 2320, + "epoch": 0.013900372673784616, + "loss/policy_avg": 0.7517778277397156, + "lr": 9.9079754601227e-06, + "objective/entropy": -69.42684936523438, + "objective/kl": 13.007519721984863, + "objective/non_score_reward": -0.6503760814666748, + "objective/rlhf_reward": -0.2015041172504426, + "objective/scores": 0.6, + "policy/approxkl_avg": 15.501136779785156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.533203125, + "step": 144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9969980716705322 + }, + { + "episode": 2336, + "epoch": 0.013996237312914165, + "loss/policy_avg": 0.1666509509086609, + "lr": 9.907336400817996e-06, + "objective/entropy": 175.3941192626953, + "objective/kl": 20.383106231689453, + "objective/non_score_reward": -1.0191553831100464, + "objective/rlhf_reward": -2.414762055099593, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 102.40309143066406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65625, + "step": 145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960074424743652 + }, + { + "episode": 2352, + "epoch": 0.014092101952043714, + "loss/policy_avg": 0.08111919462680817, + "lr": 9.906697341513293e-06, + "objective/entropy": 66.45804595947266, + "objective/kl": 20.63641357421875, + "objective/non_score_reward": -1.0318206548690796, + "objective/rlhf_reward": -2.7680326637968253, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 16.144962310791016, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.44921875, + "step": 146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003509521484375 + }, + { + "episode": 2368, + "epoch": 0.014187966591173263, + "loss/policy_avg": 0.2162848860025406, + "lr": 9.90605828220859e-06, + "objective/entropy": 66.34003448486328, + "objective/kl": 21.03724479675293, + "objective/non_score_reward": -1.051862359046936, + "objective/rlhf_reward": -1.8074494361877442, + "objective/scores": 0.6, + "policy/approxkl_avg": 56.59767150878906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.544921875, + "step": 147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9967288970947266 + }, + { + "episode": 2384, + "epoch": 0.014283831230302812, + "loss/policy_avg": 0.13452857732772827, + "lr": 9.905419222903886e-06, + "objective/entropy": 160.91929626464844, + "objective/kl": 22.133365631103516, + "objective/non_score_reward": -1.10666823387146, + "objective/rlhf_reward": -2.693339631954829, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 64.49358367919922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62109375, + "step": 148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988274574279785 + }, + { + "episode": 2400, + "epoch": 0.01437969586943236, + "loss/policy_avg": 1.6826289892196655, + "lr": 9.904780163599183e-06, + "objective/entropy": -182.28018188476562, + "objective/kl": 22.543842315673828, + "objective/non_score_reward": -1.1271920204162598, + "objective/rlhf_reward": -3.084936280449001, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 70.59880828857422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62890625, + "step": 149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0008223056793213 + }, + { + "episode": 2416, + "epoch": 0.01447556050856191, + "loss/policy_avg": 0.4059183597564697, + "lr": 9.904141104294478e-06, + "objective/entropy": 225.73135375976562, + "objective/kl": 23.115840911865234, + "objective/non_score_reward": -1.1557921171188354, + "objective/rlhf_reward": -2.8898351351420084, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 45.14168930053711, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.703125, + "step": 150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997532367706299 + }, + { + "episode": 2432, + "epoch": 0.01457142514769146, + "loss/policy_avg": 0.10681919753551483, + "lr": 9.903502044989775e-06, + "objective/entropy": 213.69598388671875, + "objective/kl": 26.178190231323242, + "objective/non_score_reward": -1.3089096546173096, + "objective/rlhf_reward": -3.894002726584106, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 92.52935791015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975218772888184 + }, + { + "episode": 2448, + "epoch": 0.01466728978682101, + "loss/policy_avg": -0.2853464186191559, + "lr": 9.902862985685072e-06, + "objective/entropy": 58.680572509765625, + "objective/kl": 17.81705665588379, + "objective/non_score_reward": -0.8908528089523315, + "objective/rlhf_reward": -0.6396921619188514, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 89.08941650390625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.669921875, + "step": 152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0143747329711914 + }, + { + "episode": 2464, + "epoch": 0.014763154425950558, + "loss/policy_avg": 0.07825072109699249, + "lr": 9.902223926380369e-06, + "objective/entropy": 198.86288452148438, + "objective/kl": 28.436542510986328, + "objective/non_score_reward": -1.4218271970748901, + "objective/rlhf_reward": -2.7635896548044414, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 44.41461181640625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994783401489258 + }, + { + "episode": 2480, + "epoch": 0.014859019065080107, + "loss/policy_avg": 0.27155977487564087, + "lr": 9.901584867075666e-06, + "objective/entropy": 89.04707336425781, + "objective/kl": 21.113758087158203, + "objective/non_score_reward": -1.0556879043579102, + "objective/rlhf_reward": -1.2990326031458106, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 58.70441818237305, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971623420715332 + }, + { + "episode": 2496, + "epoch": 0.014954883704209656, + "loss/policy_avg": 0.3080964982509613, + "lr": 9.900945807770961e-06, + "objective/entropy": 35.38983154296875, + "objective/kl": 21.02568817138672, + "objective/non_score_reward": -1.0512844324111938, + "objective/rlhf_reward": -2.7241851715401406, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 52.82551193237305, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9960044622421265 + }, + { + "episode": 2512, + "epoch": 0.015050748343339205, + "loss/policy_avg": 4.562356472015381, + "lr": 9.900306748466258e-06, + "objective/entropy": 253.11752319335938, + "objective/kl": 22.01451301574707, + "objective/non_score_reward": -1.1007256507873535, + "objective/rlhf_reward": -2.798782501284199, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 74.26364135742188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.765625, + "step": 156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9965643882751465 + }, + { + "episode": 2528, + "epoch": 0.015146612982468754, + "loss/policy_avg": 0.21197248995304108, + "lr": 9.899667689161555e-06, + "objective/entropy": 149.58770751953125, + "objective/kl": 23.317626953125, + "objective/non_score_reward": -1.1658812761306763, + "objective/rlhf_reward": -2.2635251045227047, + "objective/scores": 0.6, + "policy/approxkl_avg": 51.574981689453125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4736328125, + "step": 157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.995574951171875 + }, + { + "episode": 2544, + "epoch": 0.015242477621598303, + "loss/policy_avg": 0.20880039036273956, + "lr": 9.899028629856852e-06, + "objective/entropy": -64.38532257080078, + "objective/kl": 25.92443084716797, + "objective/non_score_reward": -1.2962216138839722, + "objective/rlhf_reward": -3.784886217117309, + "objective/scores": 0.35, + "policy/approxkl_avg": 138.45706176757812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.568359375, + "step": 158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9968822002410889 + }, + { + "episode": 2560, + "epoch": 0.015338342260727852, + "loss/policy_avg": 0.21600359678268433, + "lr": 9.898389570552149e-06, + "objective/entropy": 3.545970916748047, + "objective/kl": 23.09051513671875, + "objective/non_score_reward": -1.1545257568359375, + "objective/rlhf_reward": -2.6706922007369354, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 36.885650634765625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55859375, + "step": 159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993218183517456 + }, + { + "episode": 2576, + "epoch": 0.015434206899857401, + "loss/policy_avg": 0.5031390190124512, + "lr": 9.897750511247446e-06, + "objective/entropy": 98.00604248046875, + "objective/kl": 25.33047866821289, + "objective/non_score_reward": -1.2665239572525024, + "objective/rlhf_reward": -3.4619760847726635, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 83.63774871826172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000274896621704 + }, + { + "episode": 2592, + "epoch": 0.01553007153898695, + "loss/policy_avg": 0.018053412437438965, + "lr": 9.89711145194274e-06, + "objective/entropy": 2.8434524536132812, + "objective/kl": 24.395084381103516, + "objective/non_score_reward": -1.2197542190551758, + "objective/rlhf_reward": -3.2171576074963673, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.6353378295898438, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.64453125, + "step": 161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001478433609009 + }, + { + "episode": 2608, + "epoch": 0.0156259361781165, + "loss/policy_avg": 0.25576311349868774, + "lr": 9.896472392638038e-06, + "objective/entropy": -64.24278259277344, + "objective/kl": 16.287256240844727, + "objective/non_score_reward": -0.8143627643585205, + "objective/rlhf_reward": -1.5241178731123606, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 25.824050903320312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6953125, + "step": 162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984066486358643 + }, + { + "episode": 2624, + "epoch": 0.01572180081724605, + "loss/policy_avg": 0.2750253677368164, + "lr": 9.895833333333334e-06, + "objective/entropy": 170.5203857421875, + "objective/kl": 35.09113693237305, + "objective/non_score_reward": -1.7545567750930786, + "objective/rlhf_reward": -4.094508086086485, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 91.88323974609375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.76171875, + "step": 163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9978190660476685 + }, + { + "episode": 2640, + "epoch": 0.0158176654563756, + "loss/policy_avg": 0.2685161828994751, + "lr": 9.895194274028631e-06, + "objective/entropy": 107.911376953125, + "objective/kl": 21.708637237548828, + "objective/non_score_reward": -1.0854318141937256, + "objective/rlhf_reward": -2.8911290570214834, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 48.546165466308594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.603515625, + "step": 164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9939230680465698 + }, + { + "episode": 2656, + "epoch": 0.015913530095505148, + "loss/policy_avg": 0.3802343010902405, + "lr": 9.894555214723928e-06, + "objective/entropy": 137.427978515625, + "objective/kl": 20.673809051513672, + "objective/non_score_reward": -1.0336904525756836, + "objective/rlhf_reward": -2.793125978022247, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 36.90850830078125, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.63671875, + "step": 165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987661838531494 + }, + { + "episode": 2672, + "epoch": 0.016009394734634697, + "loss/policy_avg": 0.0008638650178909302, + "lr": 9.893916155419225e-06, + "objective/entropy": 159.45681762695312, + "objective/kl": 20.339492797851562, + "objective/non_score_reward": -1.016974687576294, + "objective/rlhf_reward": -2.7086488542303275, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 6.459288597106934, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.515625, + "step": 166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977226257324219 + }, + { + "episode": 2688, + "epoch": 0.016105259373764245, + "loss/policy_avg": 0.3463206887245178, + "lr": 9.89327709611452e-06, + "objective/entropy": -75.2735824584961, + "objective/kl": 27.865215301513672, + "objective/non_score_reward": -1.3932607173919678, + "objective/rlhf_reward": -4.173042631149292, + "objective/scores": 0.35, + "policy/approxkl_avg": 139.90060424804688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016684532165527 + }, + { + "episode": 2704, + "epoch": 0.016201124012893794, + "loss/policy_avg": 0.07642253488302231, + "lr": 9.892638036809815e-06, + "objective/entropy": 38.99913787841797, + "objective/kl": 19.061498641967773, + "objective/non_score_reward": -0.9530749320983887, + "objective/rlhf_reward": -1.987470920356821, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 22.035629272460938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.484375, + "step": 168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0013604164123535 + }, + { + "episode": 2720, + "epoch": 0.016296988652023343, + "loss/policy_avg": 0.2990867495536804, + "lr": 9.891998977505112e-06, + "objective/entropy": 199.7046661376953, + "objective/kl": 23.46067237854004, + "objective/non_score_reward": -1.1730337142944336, + "objective/rlhf_reward": -3.268302519519893, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 19.572267532348633, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6171875, + "step": 169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998270034790039 + }, + { + "episode": 2736, + "epoch": 0.016392853291152892, + "loss/policy_avg": 0.3040146231651306, + "lr": 9.89135991820041e-06, + "objective/entropy": 84.5781021118164, + "objective/kl": 24.218996047973633, + "objective/non_score_reward": -1.2109497785568237, + "objective/rlhf_reward": -2.896387885289128, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 91.4429931640625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007870197296143 + }, + { + "episode": 2752, + "epoch": 0.01648871793028244, + "loss/policy_avg": 0.24132516980171204, + "lr": 9.890720858895706e-06, + "objective/entropy": 25.26891326904297, + "objective/kl": 12.311616897583008, + "objective/non_score_reward": -0.6155807971954346, + "objective/rlhf_reward": -2.4623232781887054, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.089572906494141, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984550476074219 + }, + { + "episode": 2768, + "epoch": 0.01658458256941199, + "loss/policy_avg": 0.07815683633089066, + "lr": 9.890081799591003e-06, + "objective/entropy": -2.7739601135253906, + "objective/kl": 20.480499267578125, + "objective/non_score_reward": -1.0240248441696167, + "objective/rlhf_reward": -2.6151468185738325, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 11.766371726989746, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.52734375, + "step": 172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999391794204712 + }, + { + "episode": 2784, + "epoch": 0.01668044720854154, + "loss/policy_avg": 0.31003671884536743, + "lr": 9.8894427402863e-06, + "objective/entropy": -5.804538726806641, + "objective/kl": 23.551572799682617, + "objective/non_score_reward": -1.1775786876678467, + "objective/rlhf_reward": -3.2597167297319025, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 241.19540405273438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.587890625, + "step": 173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990514516830444 + }, + { + "episode": 2800, + "epoch": 0.016776311847671088, + "loss/policy_avg": 0.027285143733024597, + "lr": 9.888803680981595e-06, + "objective/entropy": 91.14071655273438, + "objective/kl": 19.611085891723633, + "objective/non_score_reward": -0.9805543422698975, + "objective/rlhf_reward": -2.44126462471044, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 60.10600662231445, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.537109375, + "step": 174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972784519195557 + }, + { + "episode": 2816, + "epoch": 0.016872176486800637, + "loss/policy_avg": 0.2845172882080078, + "lr": 9.888164621676892e-06, + "objective/entropy": 30.190153121948242, + "objective/kl": 24.783939361572266, + "objective/non_score_reward": -1.239197015762329, + "objective/rlhf_reward": -3.578185775367123, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 76.30748748779297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.443359375, + "step": 175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994440078735352 + }, + { + "episode": 2832, + "epoch": 0.016968041125930186, + "loss/policy_avg": 0.5662503838539124, + "lr": 9.887525562372189e-06, + "objective/entropy": 60.807342529296875, + "objective/kl": 12.370782852172852, + "objective/non_score_reward": -0.6185390949249268, + "objective/rlhf_reward": -1.0503242506581225, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 14.155126571655273, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.525390625, + "step": 176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987331628799438 + }, + { + "episode": 2848, + "epoch": 0.017063905765059735, + "loss/policy_avg": 0.08586982637643814, + "lr": 9.886886503067486e-06, + "objective/entropy": 43.38105010986328, + "objective/kl": 24.246856689453125, + "objective/non_score_reward": -1.2123429775238037, + "objective/rlhf_reward": -3.470769503203732, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 141.50592041015625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.64453125, + "step": 177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969239234924316 + }, + { + "episode": 2864, + "epoch": 0.017159770404189284, + "loss/policy_avg": 0.26094895601272583, + "lr": 9.886247443762783e-06, + "objective/entropy": 54.85191345214844, + "objective/kl": 20.912307739257812, + "objective/non_score_reward": -1.0456154346466064, + "objective/rlhf_reward": -2.7824616193771363, + "objective/scores": 0.35, + "policy/approxkl_avg": 19.43996810913086, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4755859375, + "step": 178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007808208465576 + }, + { + "episode": 2880, + "epoch": 0.017255635043318833, + "loss/policy_avg": -0.0008885636925697327, + "lr": 9.88560838445808e-06, + "objective/entropy": 1.5364952087402344, + "objective/kl": 18.547964096069336, + "objective/non_score_reward": -0.9273982048034668, + "objective/rlhf_reward": -1.762181530671056, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 103.84625244140625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.52734375, + "step": 179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0031652450561523 + }, + { + "episode": 2896, + "epoch": 0.017351499682448382, + "loss/policy_avg": 0.07095308601856232, + "lr": 9.884969325153375e-06, + "objective/entropy": -57.707908630371094, + "objective/kl": 17.486156463623047, + "objective/non_score_reward": -0.8743079304695129, + "objective/rlhf_reward": -1.3745254895844794, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 35.78956604003906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995075464248657 + }, + { + "episode": 2912, + "epoch": 0.01744736432157793, + "loss/policy_avg": 0.42247164249420166, + "lr": 9.884330265848671e-06, + "objective/entropy": 194.7113037109375, + "objective/kl": 21.53358268737793, + "objective/non_score_reward": -1.0766791105270386, + "objective/rlhf_reward": -2.750457256045893, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 58.89783477783203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996431827545166 + }, + { + "episode": 2928, + "epoch": 0.01754322896070748, + "loss/policy_avg": 0.3189627528190613, + "lr": 9.883691206543968e-06, + "objective/entropy": 125.43355560302734, + "objective/kl": 20.729223251342773, + "objective/non_score_reward": -1.0364612340927124, + "objective/rlhf_reward": -2.767242708293301, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 31.974578857421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60546875, + "step": 182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984747171401978 + }, + { + "episode": 2944, + "epoch": 0.01763909359983703, + "loss/policy_avg": 0.19416040182113647, + "lr": 9.883052147239265e-06, + "objective/entropy": 127.4957275390625, + "objective/kl": 23.107641220092773, + "objective/non_score_reward": -1.1553820371627808, + "objective/rlhf_reward": -3.2429258609689295, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 41.45734786987305, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.376953125, + "step": 183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999535322189331 + }, + { + "episode": 2960, + "epoch": 0.017734958238966578, + "loss/policy_avg": 0.04916887357831001, + "lr": 9.882413087934562e-06, + "objective/entropy": -16.33904266357422, + "objective/kl": 15.624849319458008, + "objective/non_score_reward": -0.7812424898147583, + "objective/rlhf_reward": -1.002263667360816, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 86.75860595703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8203125, + "step": 184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9967164993286133 + }, + { + "episode": 2976, + "epoch": 0.017830822878096127, + "loss/policy_avg": 0.15854808688163757, + "lr": 9.881774028629857e-06, + "objective/entropy": -9.968147277832031, + "objective/kl": 20.46514320373535, + "objective/non_score_reward": -1.0232571363449097, + "objective/rlhf_reward": -2.35969527165095, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 16.395225524902344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5859375, + "step": 185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976359605789185 + }, + { + "episode": 2992, + "epoch": 0.017926687517225676, + "loss/policy_avg": 0.36498603224754333, + "lr": 9.881134969325154e-06, + "objective/entropy": 209.59991455078125, + "objective/kl": 18.690290451049805, + "objective/non_score_reward": -0.9345145225524902, + "objective/rlhf_reward": -2.338058030605316, + "objective/scores": 0.35, + "policy/approxkl_avg": 12.64120101928711, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.623046875, + "step": 186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994118213653564 + }, + { + "episode": 3008, + "epoch": 0.018022552156355228, + "loss/policy_avg": 0.15073028206825256, + "lr": 9.880495910020451e-06, + "objective/entropy": 33.50044250488281, + "objective/kl": 21.099205017089844, + "objective/non_score_reward": -1.0549602508544922, + "objective/rlhf_reward": 0.1801587581634525, + "objective/scores": 1.1, + "policy/approxkl_avg": 28.017484664916992, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000263214111328 + }, + { + "episode": 3024, + "epoch": 0.018118416795484777, + "loss/policy_avg": 0.04914219304919243, + "lr": 9.879856850715748e-06, + "objective/entropy": 109.99685668945312, + "objective/kl": 23.795440673828125, + "objective/non_score_reward": -1.1897720098495483, + "objective/rlhf_reward": -0.3590880990028378, + "objective/scores": 1.1, + "policy/approxkl_avg": 17.797225952148438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.529296875, + "step": 188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002386569976807 + }, + { + "episode": 3040, + "epoch": 0.018214281434614326, + "loss/policy_avg": 0.26782599091529846, + "lr": 9.879217791411043e-06, + "objective/entropy": 46.40031051635742, + "objective/kl": 15.295504570007324, + "objective/non_score_reward": -0.764775276184082, + "objective/rlhf_reward": -1.6998512086614799, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 19.033124923706055, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4287109375, + "step": 189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006394386291504 + }, + { + "episode": 3056, + "epoch": 0.018310146073743875, + "loss/policy_avg": -0.0003484562039375305, + "lr": 9.87857873210634e-06, + "objective/entropy": -128.13638305664062, + "objective/kl": 23.236797332763672, + "objective/non_score_reward": -1.1618399620056152, + "objective/rlhf_reward": -2.985500340879546, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 122.61852264404297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.521484375, + "step": 190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998673439025879 + }, + { + "episode": 3072, + "epoch": 0.018406010712873424, + "loss/policy_avg": 0.285878986120224, + "lr": 9.877939672801637e-06, + "objective/entropy": -155.79151916503906, + "objective/kl": 17.15728187561035, + "objective/non_score_reward": -0.8578640818595886, + "objective/rlhf_reward": -1.6981231282154718, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 27.024686813354492, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578125, + "step": 191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977209568023682 + }, + { + "episode": 3088, + "epoch": 0.018501875352002973, + "loss/policy_avg": 0.03845605254173279, + "lr": 9.877300613496934e-06, + "objective/entropy": -79.23377227783203, + "objective/kl": 24.854154586791992, + "objective/non_score_reward": -1.2427077293395996, + "objective/rlhf_reward": -3.4145718505054266, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 108.08650970458984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9965288639068604 + }, + { + "episode": 3104, + "epoch": 0.018597739991132522, + "loss/policy_avg": 0.22054271399974823, + "lr": 9.876661554192229e-06, + "objective/entropy": 58.46562576293945, + "objective/kl": 18.69571876525879, + "objective/non_score_reward": -0.9347859621047974, + "objective/rlhf_reward": -1.3391437292099, + "objective/scores": 0.6, + "policy/approxkl_avg": 17.535587310791016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996351957321167 + }, + { + "episode": 3120, + "epoch": 0.01869360463026207, + "loss/policy_avg": 0.46004775166511536, + "lr": 9.876022494887526e-06, + "objective/entropy": 208.6689453125, + "objective/kl": 24.537294387817383, + "objective/non_score_reward": -1.2268648147583008, + "objective/rlhf_reward": -3.3511998941570074, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 103.11289978027344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6171875, + "step": 194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980366230010986 + }, + { + "episode": 3136, + "epoch": 0.01878946926939162, + "loss/policy_avg": 0.14284425973892212, + "lr": 9.875383435582823e-06, + "objective/entropy": -140.25045776367188, + "objective/kl": 21.156387329101562, + "objective/non_score_reward": -1.0578192472457886, + "objective/rlhf_reward": -1.8312772423028945, + "objective/scores": 0.6, + "policy/approxkl_avg": 95.11038208007812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69921875, + "step": 195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0021564960479736 + }, + { + "episode": 3152, + "epoch": 0.01888533390852117, + "loss/policy_avg": 0.4036502540111542, + "lr": 9.87474437627812e-06, + "objective/entropy": 97.97139739990234, + "objective/kl": 20.765098571777344, + "objective/non_score_reward": -1.038254737854004, + "objective/rlhf_reward": -1.7530193686485291, + "objective/scores": 0.6, + "policy/approxkl_avg": 33.61680603027344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.75, + "step": 196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9960455894470215 + }, + { + "episode": 3168, + "epoch": 0.018981198547650718, + "loss/policy_avg": 0.03367016091942787, + "lr": 9.874105316973416e-06, + "objective/entropy": 110.7692642211914, + "objective/kl": 32.466636657714844, + "objective/non_score_reward": -1.6233320236206055, + "objective/rlhf_reward": -4.668499465259623, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 22.905399322509766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66015625, + "step": 197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000126361846924 + }, + { + "episode": 3184, + "epoch": 0.019077063186780267, + "loss/policy_avg": 0.3382406532764435, + "lr": 9.873466257668712e-06, + "objective/entropy": -46.87655258178711, + "objective/kl": 23.83783531188965, + "objective/non_score_reward": -1.1918917894363403, + "objective/rlhf_reward": -3.44205424550168, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 26.46108055114746, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4814453125, + "step": 198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974579811096191 + }, + { + "episode": 3200, + "epoch": 0.019172927825909816, + "loss/policy_avg": 0.05052588880062103, + "lr": 9.872827198364009e-06, + "objective/entropy": -62.79549789428711, + "objective/kl": 19.587276458740234, + "objective/non_score_reward": -0.9793638586997986, + "objective/rlhf_reward": -0.9937364205133643, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.62165069580078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.564453125, + "step": 199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972220659255981 + }, + { + "episode": 3216, + "epoch": 0.019268792465039365, + "loss/policy_avg": 0.2230260968208313, + "lr": 9.872188139059305e-06, + "objective/entropy": -37.75834655761719, + "objective/kl": 23.102069854736328, + "objective/non_score_reward": -1.1551035642623901, + "objective/rlhf_reward": -3.2787786035830075, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 56.49012756347656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.583984375, + "step": 200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000248908996582 + }, + { + "episode": 3232, + "epoch": 0.019364657104168913, + "loss/policy_avg": 0.4118785858154297, + "lr": 9.871549079754602e-06, + "objective/entropy": 85.49769592285156, + "objective/kl": 25.69809913635254, + "objective/non_score_reward": -1.284904956817627, + "objective/rlhf_reward": -3.5833605816036016, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 56.752174377441406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.66015625, + "step": 201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987632036209106 + }, + { + "episode": 3248, + "epoch": 0.019460521743298462, + "loss/policy_avg": 0.06031988561153412, + "lr": 9.8709100204499e-06, + "objective/entropy": 16.456554412841797, + "objective/kl": 25.35955047607422, + "objective/non_score_reward": -1.2679774761199951, + "objective/rlhf_reward": -3.6213118239358515, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 21.745624542236328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.529296875, + "step": 202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980382919311523 + }, + { + "episode": 3264, + "epoch": 0.01955638638242801, + "loss/policy_avg": 0.06312263011932373, + "lr": 9.870270961145196e-06, + "objective/entropy": 132.99948120117188, + "objective/kl": 22.432659149169922, + "objective/non_score_reward": -1.1216330528259277, + "objective/rlhf_reward": -2.8246725253468616, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 93.43849182128906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.568359375, + "step": 203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995737075805664 + }, + { + "episode": 3280, + "epoch": 0.01965225102155756, + "loss/policy_avg": 0.6064414978027344, + "lr": 9.869631901840491e-06, + "objective/entropy": -19.207683563232422, + "objective/kl": 18.83993148803711, + "objective/non_score_reward": -0.9419965744018555, + "objective/rlhf_reward": -2.3173880978540033, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 90.60572052001953, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4931640625, + "step": 204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000382661819458 + }, + { + "episode": 3296, + "epoch": 0.01974811566068711, + "loss/policy_avg": 0.2940763831138611, + "lr": 9.868992842535788e-06, + "objective/entropy": 83.77371978759766, + "objective/kl": 25.884700775146484, + "objective/non_score_reward": -1.2942349910736084, + "objective/rlhf_reward": -3.3521112903681507, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 39.873409271240234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.447265625, + "step": 205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972522258758545 + }, + { + "episode": 3312, + "epoch": 0.019843980299816658, + "loss/policy_avg": 0.18257562816143036, + "lr": 9.868353783231085e-06, + "objective/entropy": 119.6646728515625, + "objective/kl": 27.568458557128906, + "objective/non_score_reward": -1.3784228563308716, + "objective/rlhf_reward": -1.1136915445327755, + "objective/scores": 1.1, + "policy/approxkl_avg": 48.24208068847656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66796875, + "step": 206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987564086914062 + }, + { + "episode": 3328, + "epoch": 0.019939844938946207, + "loss/policy_avg": -0.011964879930019379, + "lr": 9.867714723926382e-06, + "objective/entropy": 79.78416442871094, + "objective/kl": 24.409799575805664, + "objective/non_score_reward": -1.2204899787902832, + "objective/rlhf_reward": -3.5033578658975184, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 19.269145965576172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4658203125, + "step": 207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000608205795288 + }, + { + "episode": 3344, + "epoch": 0.020035709578075756, + "loss/policy_avg": 0.04908262565732002, + "lr": 9.867075664621679e-06, + "objective/entropy": 174.413818359375, + "objective/kl": 24.83539581298828, + "objective/non_score_reward": -1.241769790649414, + "objective/rlhf_reward": -3.3629594779649548, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 14.995980262756348, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.54296875, + "step": 208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985637664794922 + }, + { + "episode": 3360, + "epoch": 0.020131574217205305, + "loss/policy_avg": 0.14710021018981934, + "lr": 9.866436605316974e-06, + "objective/entropy": 132.51194763183594, + "objective/kl": 29.743432998657227, + "objective/non_score_reward": -1.4871716499328613, + "objective/rlhf_reward": -4.344566795889454, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 65.08041381835938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.490234375, + "step": 209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0021185874938965 + }, + { + "episode": 3376, + "epoch": 0.020227438856334854, + "loss/policy_avg": 0.0796532854437828, + "lr": 9.86579754601227e-06, + "objective/entropy": 1.3461151123046875, + "objective/kl": 26.279298782348633, + "objective/non_score_reward": -1.313965082168579, + "objective/rlhf_reward": -0.8558599710464474, + "objective/scores": 1.1, + "policy/approxkl_avg": 105.49284362792969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989545345306396 + }, + { + "episode": 3392, + "epoch": 0.020323303495464403, + "loss/policy_avg": -0.03664415329694748, + "lr": 9.865158486707568e-06, + "objective/entropy": -37.266082763671875, + "objective/kl": 19.48423957824707, + "objective/non_score_reward": -0.9742119908332825, + "objective/rlhf_reward": -0.9731288298380103, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 8.304027557373047, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.638671875, + "step": 211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.003216028213501 + }, + { + "episode": 3408, + "epoch": 0.020419168134593952, + "loss/policy_avg": 0.30985838174819946, + "lr": 9.864519427402863e-06, + "objective/entropy": 94.80859375, + "objective/kl": 29.94342041015625, + "objective/non_score_reward": -1.4971709251403809, + "objective/rlhf_reward": -4.564851482112971, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 115.7642593383789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.74609375, + "step": 212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9959279298782349 + }, + { + "episode": 3424, + "epoch": 0.0205150327737235, + "loss/policy_avg": 0.23234406113624573, + "lr": 9.86388036809816e-06, + "objective/entropy": 125.32878875732422, + "objective/kl": 33.22450637817383, + "objective/non_score_reward": -1.6612253189086914, + "objective/rlhf_reward": -4.820072407993387, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 82.43852233886719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001713752746582 + }, + { + "episode": 3440, + "epoch": 0.02061089741285305, + "loss/policy_avg": 1.5097947120666504, + "lr": 9.863241308793457e-06, + "objective/entropy": 132.66845703125, + "objective/kl": 27.622318267822266, + "objective/non_score_reward": -1.3811159133911133, + "objective/rlhf_reward": -3.6996345475044956, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 26.179336547851562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.708984375, + "step": 214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993813037872314 + }, + { + "episode": 3456, + "epoch": 0.0207067620519826, + "loss/policy_avg": 0.12209601700305939, + "lr": 9.862602249488753e-06, + "objective/entropy": 132.88406372070312, + "objective/kl": 26.24971580505371, + "objective/non_score_reward": -1.312485694885254, + "objective/rlhf_reward": -5.249942898750305, + "objective/scores": 0.0, + "policy/approxkl_avg": 41.524139404296875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7109375, + "step": 215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990661144256592 + }, + { + "episode": 3472, + "epoch": 0.02080262669111215, + "loss/policy_avg": 0.3654727339744568, + "lr": 9.86196319018405e-06, + "objective/entropy": 39.344974517822266, + "objective/kl": 23.619754791259766, + "objective/non_score_reward": -1.18098783493042, + "objective/rlhf_reward": -1.8002320870172706, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.19040584564209, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4951171875, + "step": 216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990694522857666 + }, + { + "episode": 3488, + "epoch": 0.0208984913302417, + "loss/policy_avg": 0.05907230079174042, + "lr": 9.861324130879346e-06, + "objective/entropy": -49.055564880371094, + "objective/kl": 27.70423126220703, + "objective/non_score_reward": -1.3852115869522095, + "objective/rlhf_reward": -3.8789869598752125, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 62.16511917114258, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973804950714111 + }, + { + "episode": 3504, + "epoch": 0.02099435596937125, + "loss/policy_avg": 0.5758800506591797, + "lr": 9.860685071574642e-06, + "objective/entropy": 18.1787166595459, + "objective/kl": 25.688358306884766, + "objective/non_score_reward": -1.2844178676605225, + "objective/rlhf_reward": -2.2139523147952285, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 23.39984130859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.498046875, + "step": 218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974064826965332 + }, + { + "episode": 3520, + "epoch": 0.021090220608500798, + "loss/policy_avg": 0.2610527575016022, + "lr": 9.86004601226994e-06, + "objective/entropy": -68.09791564941406, + "objective/kl": 26.7615966796875, + "objective/non_score_reward": -1.3380796909332275, + "objective/rlhf_reward": -4.026806149512453, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 124.13450622558594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4599609375, + "step": 219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986741542816162 + }, + { + "episode": 3536, + "epoch": 0.021186085247630347, + "loss/policy_avg": 0.1624567210674286, + "lr": 9.859406952965236e-06, + "objective/entropy": -113.99856567382812, + "objective/kl": 19.689868927001953, + "objective/non_score_reward": -0.9844935536384583, + "objective/rlhf_reward": -2.113145466121744, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 45.295875549316406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0004196166992188 + }, + { + "episode": 3552, + "epoch": 0.021281949886759896, + "loss/policy_avg": 0.13548433780670166, + "lr": 9.858767893660533e-06, + "objective/entropy": 154.66708374023438, + "objective/kl": 31.08365249633789, + "objective/non_score_reward": -1.554182529449463, + "objective/rlhf_reward": -4.554870968282805, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 43.560997009277344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972419738769531 + }, + { + "episode": 3568, + "epoch": 0.021377814525889445, + "loss/policy_avg": 0.04025420919060707, + "lr": 9.858128834355828e-06, + "objective/entropy": 145.02468872070312, + "objective/kl": 31.459678649902344, + "objective/non_score_reward": -1.572983980178833, + "objective/rlhf_reward": -4.932686292861385, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 41.05935287475586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4560546875, + "step": 222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009706020355225 + }, + { + "episode": 3584, + "epoch": 0.021473679165018994, + "loss/policy_avg": 1.5885295867919922, + "lr": 9.857489775051125e-06, + "objective/entropy": 141.5781707763672, + "objective/kl": 34.53314971923828, + "objective/non_score_reward": -1.726657509803772, + "objective/rlhf_reward": -5.244770532072174, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 37.03607177734375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.658203125, + "step": 223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.992570400238037 + }, + { + "episode": 3600, + "epoch": 0.021569543804148543, + "loss/policy_avg": 0.9811650514602661, + "lr": 9.856850715746422e-06, + "objective/entropy": -30.946441650390625, + "objective/kl": 29.145998001098633, + "objective/non_score_reward": -1.4572999477386475, + "objective/rlhf_reward": -4.450597622481686, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 19.481060028076172, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.462890625, + "step": 224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983983039855957 + }, + { + "episode": 3616, + "epoch": 0.021665408443278092, + "loss/policy_avg": 0.5196128487586975, + "lr": 9.856211656441719e-06, + "objective/entropy": -16.55962371826172, + "objective/kl": 28.4706974029541, + "objective/non_score_reward": -1.423534870147705, + "objective/rlhf_reward": -3.5714332482972484, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 117.12289428710938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.732421875, + "step": 225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975980520248413 + }, + { + "episode": 3632, + "epoch": 0.02176127308240764, + "loss/policy_avg": 0.6528609395027161, + "lr": 9.855572597137016e-06, + "objective/entropy": 136.64077758789062, + "objective/kl": 32.46646499633789, + "objective/non_score_reward": -1.6233232021331787, + "objective/rlhf_reward": -2.093292927742004, + "objective/scores": 1.1, + "policy/approxkl_avg": 44.35145950317383, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.669921875, + "step": 226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994146823883057 + }, + { + "episode": 3648, + "epoch": 0.02185713772153719, + "loss/policy_avg": 0.9434906244277954, + "lr": 9.854933537832313e-06, + "objective/entropy": -36.75615310668945, + "objective/kl": 31.890575408935547, + "objective/non_score_reward": -1.5945286750793457, + "objective/rlhf_reward": -5.052601966887636, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 65.19577026367188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.59375, + "step": 227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979374408721924 + }, + { + "episode": 3664, + "epoch": 0.02195300236066674, + "loss/policy_avg": 0.36130765080451965, + "lr": 9.854294478527608e-06, + "objective/entropy": 47.61101531982422, + "objective/kl": 18.669593811035156, + "objective/non_score_reward": -0.9334796071052551, + "objective/rlhf_reward": -2.3339184284210206, + "objective/scores": 0.35, + "policy/approxkl_avg": 15.266149520874023, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.578125, + "step": 228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9959979057312012 + }, + { + "episode": 3680, + "epoch": 0.022048866999796288, + "loss/policy_avg": 0.18321090936660767, + "lr": 9.853655419222905e-06, + "objective/entropy": 116.60293579101562, + "objective/kl": 27.56112289428711, + "objective/non_score_reward": -1.378056287765503, + "objective/rlhf_reward": -3.5648136837052657, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 29.471284866333008, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.537109375, + "step": 229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991540908813477 + }, + { + "episode": 3696, + "epoch": 0.022144731638925837, + "loss/policy_avg": -0.044996485114097595, + "lr": 9.853016359918202e-06, + "objective/entropy": 38.275238037109375, + "objective/kl": 28.720836639404297, + "objective/non_score_reward": -1.4360418319702148, + "objective/rlhf_reward": -4.187907754388407, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 173.6102752685547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.552734375, + "step": 230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997882604598999 + }, + { + "episode": 3712, + "epoch": 0.022240596278055386, + "loss/policy_avg": 0.027855467051267624, + "lr": 9.852377300613498e-06, + "objective/entropy": 123.59611511230469, + "objective/kl": 30.175601959228516, + "objective/non_score_reward": -1.5087801218032837, + "objective/rlhf_reward": -4.478861062732294, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 50.733642578125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.37109375, + "step": 231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003466606140137 + }, + { + "episode": 3728, + "epoch": 0.022336460917184935, + "loss/policy_avg": -0.3093503713607788, + "lr": 9.851738241308795e-06, + "objective/entropy": 0.438995361328125, + "objective/kl": 27.025171279907227, + "objective/non_score_reward": -1.3512585163116455, + "objective/rlhf_reward": -5.405034303665161, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.092641830444336, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.615234375, + "step": 232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000675678253174 + }, + { + "episode": 3744, + "epoch": 0.022432325556314484, + "loss/policy_avg": -0.05236402899026871, + "lr": 9.85109918200409e-06, + "objective/entropy": 112.74819946289062, + "objective/kl": 24.94538688659668, + "objective/non_score_reward": -1.2472693920135498, + "objective/rlhf_reward": -3.473305845054325, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 19.200075149536133, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3583984375, + "step": 233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002117395401001 + }, + { + "episode": 3760, + "epoch": 0.022528190195444033, + "loss/policy_avg": 0.21103611588478088, + "lr": 9.850460122699387e-06, + "objective/entropy": 73.77043151855469, + "objective/kl": 28.00216293334961, + "objective/non_score_reward": -1.4001080989837646, + "objective/rlhf_reward": -3.6530211669968917, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 13.291183471679688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5009765625, + "step": 234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995331764221191 + }, + { + "episode": 3776, + "epoch": 0.02262405483457358, + "loss/policy_avg": 0.6418443918228149, + "lr": 9.849821063394683e-06, + "objective/entropy": 19.92426300048828, + "objective/kl": 31.282997131347656, + "objective/non_score_reward": -1.5641499757766724, + "objective/rlhf_reward": -4.931086901456041, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 98.59768676757812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.34375, + "step": 235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005524158477783 + }, + { + "episode": 3792, + "epoch": 0.02271991947370313, + "loss/policy_avg": 0.20836295187473297, + "lr": 9.84918200408998e-06, + "objective/entropy": 28.238201141357422, + "objective/kl": 29.105060577392578, + "objective/non_score_reward": -1.455253005027771, + "objective/rlhf_reward": -4.264752714839533, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 34.374176025390625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.51171875, + "step": 236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989919662475586 + }, + { + "episode": 3808, + "epoch": 0.02281578411283268, + "loss/policy_avg": 0.43571943044662476, + "lr": 9.848542944785276e-06, + "objective/entropy": 144.94302368164062, + "objective/kl": 33.369178771972656, + "objective/non_score_reward": -1.6684589385986328, + "objective/rlhf_reward": -5.314585768912716, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 113.68771362304688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.607421875, + "step": 237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996192216873169 + }, + { + "episode": 3824, + "epoch": 0.02291164875196223, + "loss/policy_avg": 0.14893671870231628, + "lr": 9.847903885480573e-06, + "objective/entropy": 186.38681030273438, + "objective/kl": 41.077842712402344, + "objective/non_score_reward": -2.0538923740386963, + "objective/rlhf_reward": -6.611449215475636, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 168.3666229248047, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.755859375, + "step": 238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984937906265259 + }, + { + "episode": 3840, + "epoch": 0.023007513391091777, + "loss/policy_avg": 0.07648584991693497, + "lr": 9.84726482617587e-06, + "objective/entropy": -37.23631286621094, + "objective/kl": 25.318248748779297, + "objective/non_score_reward": -1.2659125328063965, + "objective/rlhf_reward": -3.5073907067447454, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 50.266414642333984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48828125, + "step": 239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979995489120483 + }, + { + "episode": 3856, + "epoch": 0.023103378030221326, + "loss/policy_avg": -0.15926438570022583, + "lr": 9.846625766871167e-06, + "objective/entropy": 37.868736267089844, + "objective/kl": 27.493305206298828, + "objective/non_score_reward": -1.3746652603149414, + "objective/rlhf_reward": -4.173148546248598, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 6.63505220413208, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5390625, + "step": 240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0081896781921387 + }, + { + "episode": 3872, + "epoch": 0.023199242669350875, + "loss/policy_avg": 0.14562831819057465, + "lr": 9.845986707566462e-06, + "objective/entropy": 15.188220977783203, + "objective/kl": 28.046958923339844, + "objective/non_score_reward": -1.4023480415344238, + "objective/rlhf_reward": -4.1587937875703425, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 43.238990783691406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578125, + "step": 241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996706247329712 + }, + { + "episode": 3888, + "epoch": 0.023295107308480424, + "loss/policy_avg": 0.11054911464452744, + "lr": 9.845347648261759e-06, + "objective/entropy": 65.03858947753906, + "objective/kl": 30.087387084960938, + "objective/non_score_reward": -1.5043694972991943, + "objective/rlhf_reward": -4.070066402630742, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 3.83949613571167, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988956451416016 + }, + { + "episode": 3904, + "epoch": 0.023390971947609973, + "loss/policy_avg": 0.3941475749015808, + "lr": 9.844708588957056e-06, + "objective/entropy": 59.93316650390625, + "objective/kl": 25.623512268066406, + "objective/non_score_reward": -1.2811756134033203, + "objective/rlhf_reward": -3.52058264977129, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 78.30380249023438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5859375, + "step": 243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990283250808716 + }, + { + "episode": 3920, + "epoch": 0.023486836586739522, + "loss/policy_avg": 0.19095474481582642, + "lr": 9.844069529652353e-06, + "objective/entropy": 31.422988891601562, + "objective/kl": 24.865825653076172, + "objective/non_score_reward": -1.2432913780212402, + "objective/rlhf_reward": -3.2398319403330484, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 38.12981033325195, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.53125, + "step": 244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.004727840423584 + }, + { + "episode": 3936, + "epoch": 0.023582701225869074, + "loss/policy_avg": 0.049357250332832336, + "lr": 9.84343047034765e-06, + "objective/entropy": 21.297576904296875, + "objective/kl": 35.60150146484375, + "objective/non_score_reward": -1.7800750732421875, + "objective/rlhf_reward": -5.720300531387329, + "objective/scores": 0.35, + "policy/approxkl_avg": 38.869449615478516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4716796875, + "step": 245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0019991397857666 + }, + { + "episode": 3952, + "epoch": 0.023678565864998623, + "loss/policy_avg": 0.7713517546653748, + "lr": 9.842791411042945e-06, + "objective/entropy": 53.62720489501953, + "objective/kl": 31.218942642211914, + "objective/non_score_reward": -1.5609471797943115, + "objective/rlhf_reward": -4.296377490239079, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 48.73869323730469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975836277008057 + }, + { + "episode": 3968, + "epoch": 0.023774430504128172, + "loss/policy_avg": 0.008143262937664986, + "lr": 9.842152351738242e-06, + "objective/entropy": 171.02789306640625, + "objective/kl": 34.79176330566406, + "objective/non_score_reward": -1.7395880222320557, + "objective/rlhf_reward": -5.296492939413176, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 21.7828369140625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.57421875, + "step": 247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989376068115234 + }, + { + "episode": 3984, + "epoch": 0.02387029514325772, + "loss/policy_avg": -0.12264247238636017, + "lr": 9.841513292433539e-06, + "objective/entropy": 80.24577331542969, + "objective/kl": 33.11949920654297, + "objective/non_score_reward": -1.6559748649597168, + "objective/rlhf_reward": -4.799071069034646, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 61.87395477294922, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4599609375, + "step": 248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003005027770996 + }, + { + "episode": 4000, + "epoch": 0.02396615978238727, + "loss/policy_avg": 0.2658330202102661, + "lr": 9.840874233128836e-06, + "objective/entropy": 149.58941650390625, + "objective/kl": 29.3863525390625, + "objective/non_score_reward": -1.4693175554275513, + "objective/rlhf_reward": -4.273150358263569, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 58.66055679321289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51171875, + "step": 249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972658157348633 + }, + { + "episode": 4016, + "epoch": 0.02406202442151682, + "loss/policy_avg": 0.09115779399871826, + "lr": 9.840235173824132e-06, + "objective/entropy": 147.28927612304688, + "objective/kl": 31.492679595947266, + "objective/non_score_reward": -1.5746338367462158, + "objective/rlhf_reward": -4.939285838340206, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 28.799278259277344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.796875, + "step": 250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002530574798584 + }, + { + "episode": 4032, + "epoch": 0.024157889060646368, + "loss/policy_avg": 0.09398385882377625, + "lr": 9.83959611451943e-06, + "objective/entropy": -45.248435974121094, + "objective/kl": 28.402175903320312, + "objective/non_score_reward": -1.4201087951660156, + "objective/rlhf_reward": -4.018575882137405, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 19.838550567626953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.517578125, + "step": 251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973764419555664 + }, + { + "episode": 4048, + "epoch": 0.024253753699775917, + "loss/policy_avg": 0.19270983338356018, + "lr": 9.838957055214724e-06, + "objective/entropy": 77.1705093383789, + "objective/kl": 34.050987243652344, + "objective/non_score_reward": -1.7025493383407593, + "objective/rlhf_reward": -5.076863960425058, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 18.725093841552734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4814453125, + "step": 252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001063823699951 + }, + { + "episode": 4064, + "epoch": 0.024349618338905466, + "loss/policy_avg": 0.4652649164199829, + "lr": 9.838317995910021e-06, + "objective/entropy": 257.7345886230469, + "objective/kl": 24.133747100830078, + "objective/non_score_reward": -1.2066874504089355, + "objective/rlhf_reward": -3.4481475735581935, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 41.46368408203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80078125, + "step": 253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9955878257751465 + }, + { + "episode": 4080, + "epoch": 0.024445482978035015, + "loss/policy_avg": 0.14692571759223938, + "lr": 9.837678936605318e-06, + "objective/entropy": 43.00188064575195, + "objective/kl": 24.73518180847168, + "objective/non_score_reward": -1.236759066581726, + "objective/rlhf_reward": -3.568433978644711, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 75.05264282226562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990639686584473 + }, + { + "episode": 4096, + "epoch": 0.024541347617164564, + "loss/policy_avg": 0.08271847665309906, + "lr": 9.837039877300615e-06, + "objective/entropy": -79.57066345214844, + "objective/kl": 26.90784454345703, + "objective/non_score_reward": -1.3453922271728516, + "objective/rlhf_reward": -3.648235575358073, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 24.23294448852539, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619140625, + "step": 255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984873533248901 + }, + { + "episode": 4112, + "epoch": 0.024637212256294113, + "loss/policy_avg": 0.12403183430433273, + "lr": 9.83640081799591e-06, + "objective/entropy": 87.87326049804688, + "objective/kl": 29.708419799804688, + "objective/non_score_reward": -1.4854209423065186, + "objective/rlhf_reward": -4.116855438026499, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 32.65428161621094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62890625, + "step": 256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981478452682495 + }, + { + "episode": 4128, + "epoch": 0.024733076895423662, + "loss/policy_avg": -0.17764857411384583, + "lr": 9.835761758691207e-06, + "objective/entropy": 130.6345977783203, + "objective/kl": 34.35237121582031, + "objective/non_score_reward": -1.717618465423584, + "objective/rlhf_reward": -5.314214794841364, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 118.99533081054688, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.3974609375, + "step": 257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.014057159423828 + }, + { + "episode": 4144, + "epoch": 0.02482894153455321, + "loss/policy_avg": 2.400163173675537, + "lr": 9.835122699386504e-06, + "objective/entropy": 123.72301483154297, + "objective/kl": 21.25601577758789, + "objective/non_score_reward": -1.0628007650375366, + "objective/rlhf_reward": 0.1487968802452091, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.07887268066406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.572265625, + "step": 258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998108148574829 + }, + { + "episode": 4160, + "epoch": 0.02492480617368276, + "loss/policy_avg": 0.3900964856147766, + "lr": 9.8344836400818e-06, + "objective/entropy": 233.3748321533203, + "objective/kl": 42.447425842285156, + "objective/non_score_reward": -2.1223714351654053, + "objective/rlhf_reward": -5.5657667263757915, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 19.722026824951172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.74609375, + "step": 259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000584363937378 + }, + { + "episode": 4176, + "epoch": 0.02502067081281231, + "loss/policy_avg": 0.3361247181892395, + "lr": 9.833844580777096e-06, + "objective/entropy": 135.13961791992188, + "objective/kl": 31.25783920288086, + "objective/non_score_reward": -1.5628920793533325, + "objective/rlhf_reward": -4.426739449771952, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 16.49414825439453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.486328125, + "step": 260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986810684204102 + }, + { + "episode": 4192, + "epoch": 0.025116535451941858, + "loss/policy_avg": 0.1438344419002533, + "lr": 9.833205521472393e-06, + "objective/entropy": 104.18168640136719, + "objective/kl": 35.72525405883789, + "objective/non_score_reward": -1.7862627506256104, + "objective/rlhf_reward": -5.320222015651773, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 22.100770950317383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65625, + "step": 261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996044635772705 + }, + { + "episode": 4208, + "epoch": 0.025212400091071407, + "loss/policy_avg": 2.402132034301758, + "lr": 9.83256646216769e-06, + "objective/entropy": 91.16908264160156, + "objective/kl": 29.633235931396484, + "objective/non_score_reward": -1.4816619157791138, + "objective/rlhf_reward": -4.476049522967681, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 43.586891174316406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.64453125, + "step": 262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.004427433013916 + }, + { + "episode": 4224, + "epoch": 0.025308264730200956, + "loss/policy_avg": 0.7259080410003662, + "lr": 9.831927402862987e-06, + "objective/entropy": 154.68115234375, + "objective/kl": 37.00696563720703, + "objective/non_score_reward": -1.8503483533859253, + "objective/rlhf_reward": -5.576564307483743, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 16.052043914794922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.484375, + "step": 263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974215030670166 + }, + { + "episode": 4240, + "epoch": 0.025404129369330505, + "loss/policy_avg": 0.09373458474874496, + "lr": 9.831288343558284e-06, + "objective/entropy": 72.85606384277344, + "objective/kl": 27.522302627563477, + "objective/non_score_reward": -1.376115083694458, + "objective/rlhf_reward": -3.679631943973612, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 142.1138916015625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.310546875, + "step": 264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991629123687744 + }, + { + "episode": 4256, + "epoch": 0.025499994008460054, + "loss/policy_avg": 0.7555310130119324, + "lr": 9.830649284253579e-06, + "objective/entropy": 72.61222076416016, + "objective/kl": 30.647029876708984, + "objective/non_score_reward": -1.5323514938354492, + "objective/rlhf_reward": -4.705573756893244, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 54.394874572753906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.587890625, + "step": 265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006964206695557 + }, + { + "episode": 4272, + "epoch": 0.025595858647589603, + "loss/policy_avg": 0.6551899313926697, + "lr": 9.830010224948876e-06, + "objective/entropy": 121.19924926757812, + "objective/kl": 33.96527099609375, + "objective/non_score_reward": -1.6982636451721191, + "objective/rlhf_reward": -5.131195192754852, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 40.39656066894531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.470703125, + "step": 266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999192476272583 + }, + { + "episode": 4288, + "epoch": 0.02569172328671915, + "loss/policy_avg": 1.1016074419021606, + "lr": 9.829371165644173e-06, + "objective/entropy": 132.00601196289062, + "objective/kl": 43.09049987792969, + "objective/non_score_reward": -2.154524803161621, + "objective/rlhf_reward": -7.102327191623386, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 126.27546691894531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3935546875, + "step": 267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990718364715576 + }, + { + "episode": 4304, + "epoch": 0.0257875879258487, + "loss/policy_avg": 0.08981708437204361, + "lr": 9.82873210633947e-06, + "objective/entropy": 140.80239868164062, + "objective/kl": 26.626178741455078, + "objective/non_score_reward": -1.3313090801239014, + "objective/rlhf_reward": -0.9252360224723812, + "objective/scores": 1.1, + "policy/approxkl_avg": 84.53665924072266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.841796875, + "step": 268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997150182723999 + }, + { + "episode": 4320, + "epoch": 0.02588345256497825, + "loss/policy_avg": 0.565528929233551, + "lr": 9.828093047034766e-06, + "objective/entropy": 138.6593017578125, + "objective/kl": 32.08763885498047, + "objective/non_score_reward": -1.604382038116455, + "objective/rlhf_reward": -4.813408408228474, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 34.42543029785156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.427734375, + "step": 269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018558502197266 + }, + { + "episode": 4336, + "epoch": 0.0259793172041078, + "loss/policy_avg": 0.4312899708747864, + "lr": 9.827453987730061e-06, + "objective/entropy": 20.17654800415039, + "objective/kl": 23.528181076049805, + "objective/non_score_reward": -1.176409125328064, + "objective/rlhf_reward": -2.5829304478326183, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 20.440711975097656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7890625, + "step": 270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989676475524902 + }, + { + "episode": 4352, + "epoch": 0.026075181843237347, + "loss/policy_avg": 0.20729105174541473, + "lr": 9.826814928425358e-06, + "objective/entropy": 166.21115112304688, + "objective/kl": 31.01326560974121, + "objective/non_score_reward": -1.5506633520126343, + "objective/rlhf_reward": -6.202653288841248, + "objective/scores": 0.0, + "policy/approxkl_avg": 34.41830825805664, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003887176513672 + }, + { + "episode": 4368, + "epoch": 0.026171046482366896, + "loss/policy_avg": 3.2944061756134033, + "lr": 9.826175869120655e-06, + "objective/entropy": 28.755096435546875, + "objective/kl": 31.482175827026367, + "objective/non_score_reward": -1.5741088390350342, + "objective/rlhf_reward": -4.917833187667233, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 5.366632461547852, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3701171875, + "step": 272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0053317546844482 + }, + { + "episode": 4384, + "epoch": 0.02626691112149645, + "loss/policy_avg": 0.23004142940044403, + "lr": 9.825536809815952e-06, + "objective/entropy": 54.82402038574219, + "objective/kl": 32.45307922363281, + "objective/non_score_reward": -1.6226541996002197, + "objective/rlhf_reward": -5.148980966120391, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 31.775432586669922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.525390625, + "step": 273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995795488357544 + }, + { + "episode": 4400, + "epoch": 0.026362775760625998, + "loss/policy_avg": -0.08435960114002228, + "lr": 9.824897750511249e-06, + "objective/entropy": 98.25897216796875, + "objective/kl": 28.68474578857422, + "objective/non_score_reward": -1.4342372417449951, + "objective/rlhf_reward": -5.73694920539856, + "objective/scores": 0.0, + "policy/approxkl_avg": 72.97157287597656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.517578125, + "step": 274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0012598037719727 + }, + { + "episode": 4416, + "epoch": 0.026458640399755547, + "loss/policy_avg": 0.41626134514808655, + "lr": 9.824258691206546e-06, + "objective/entropy": 83.60694885253906, + "objective/kl": 30.977035522460938, + "objective/non_score_reward": -1.548851728439331, + "objective/rlhf_reward": -4.795407152175903, + "objective/scores": 0.35, + "policy/approxkl_avg": 39.04691696166992, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.60546875, + "step": 275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0021119117736816 + }, + { + "episode": 4432, + "epoch": 0.026554505038885096, + "loss/policy_avg": 0.43957769870758057, + "lr": 9.823619631901841e-06, + "objective/entropy": 127.34529113769531, + "objective/kl": 35.28544616699219, + "objective/non_score_reward": -1.7642724514007568, + "objective/rlhf_reward": -5.606491903872833, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 150.78646850585938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.748046875, + "step": 276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970619678497314 + }, + { + "episode": 4448, + "epoch": 0.026650369678014645, + "loss/policy_avg": 0.8086847066879272, + "lr": 9.822980572597138e-06, + "objective/entropy": -119.74644470214844, + "objective/kl": 26.706302642822266, + "objective/non_score_reward": -1.335315227508545, + "objective/rlhf_reward": -3.9412606716156002, + "objective/scores": 0.35, + "policy/approxkl_avg": 65.78569793701172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.568359375, + "step": 277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986886978149414 + }, + { + "episode": 4464, + "epoch": 0.026746234317144194, + "loss/policy_avg": 0.09760895371437073, + "lr": 9.822341513292433e-06, + "objective/entropy": 209.31890869140625, + "objective/kl": 41.666831970214844, + "objective/non_score_reward": -2.083341598510742, + "objective/rlhf_reward": -6.7292466498056225, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 14.525606155395508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980053901672363 + }, + { + "episode": 4480, + "epoch": 0.026842098956273742, + "loss/policy_avg": 0.0820450559258461, + "lr": 9.82170245398773e-06, + "objective/entropy": 152.01095581054688, + "objective/kl": 29.104724884033203, + "objective/non_score_reward": -1.4552361965179443, + "objective/rlhf_reward": -4.159085219324218, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 21.12679100036621, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4560546875, + "step": 279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99649977684021 + }, + { + "episode": 4496, + "epoch": 0.02693796359540329, + "loss/policy_avg": 0.08112587034702301, + "lr": 9.821063394683027e-06, + "objective/entropy": 49.22539138793945, + "objective/kl": 32.40191650390625, + "objective/non_score_reward": -1.6200958490371704, + "objective/rlhf_reward": -5.029785375209197, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 4.874902725219727, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.404296875, + "step": 280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0027780532836914 + }, + { + "episode": 4512, + "epoch": 0.02703382823453284, + "loss/policy_avg": 0.41851094365119934, + "lr": 9.820424335378324e-06, + "objective/entropy": 108.13827514648438, + "objective/kl": 44.792015075683594, + "objective/non_score_reward": -2.239600658416748, + "objective/rlhf_reward": -7.133574362072061, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 67.72032165527344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.537109375, + "step": 281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979641437530518 + }, + { + "episode": 4528, + "epoch": 0.02712969287366239, + "loss/policy_avg": 0.8327301144599915, + "lr": 9.81978527607362e-06, + "objective/entropy": 70.98486328125, + "objective/kl": 43.82145690917969, + "objective/non_score_reward": -2.191072702407837, + "objective/rlhf_reward": -7.283338430340647, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 1.1268585920333862, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.427734375, + "step": 282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002413749694824 + }, + { + "episode": 4544, + "epoch": 0.02722555751279194, + "loss/policy_avg": 0.26003268361091614, + "lr": 9.819146216768916e-06, + "objective/entropy": 59.813140869140625, + "objective/kl": 32.33997344970703, + "objective/non_score_reward": -1.6169987916946411, + "objective/rlhf_reward": -4.643166418346476, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 108.00172424316406, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.60546875, + "step": 283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996032953262329 + }, + { + "episode": 4560, + "epoch": 0.027321422151921487, + "loss/policy_avg": 0.06828334182500839, + "lr": 9.818507157464213e-06, + "objective/entropy": 164.7733154296875, + "objective/kl": 36.976539611816406, + "objective/non_score_reward": -1.8488272428512573, + "objective/rlhf_reward": -5.791188750330525, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 22.712989807128906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.498046875, + "step": 284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998927354812622 + }, + { + "episode": 4576, + "epoch": 0.027417286791051036, + "loss/policy_avg": 0.346102774143219, + "lr": 9.81786809815951e-06, + "objective/entropy": 141.91213989257812, + "objective/kl": 29.89690589904785, + "objective/non_score_reward": -1.4948452711105347, + "objective/rlhf_reward": -4.5793810248374935, + "objective/scores": 0.35, + "policy/approxkl_avg": 4.914261817932129, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.583984375, + "step": 285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991732835769653 + }, + { + "episode": 4592, + "epoch": 0.027513151430180585, + "loss/policy_avg": 0.07111110538244247, + "lr": 9.817229038854806e-06, + "objective/entropy": -41.44879150390625, + "objective/kl": 29.296417236328125, + "objective/non_score_reward": -1.4648208618164062, + "objective/rlhf_reward": -4.4806815172113, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 70.16557312011719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5625, + "step": 286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982048273086548 + }, + { + "episode": 4608, + "epoch": 0.027609016069310134, + "loss/policy_avg": 0.6204440593719482, + "lr": 9.816589979550103e-06, + "objective/entropy": 10.609687805175781, + "objective/kl": 34.5562744140625, + "objective/non_score_reward": -1.727813720703125, + "objective/rlhf_reward": -5.552005314563198, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 44.11948776245117, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4677734375, + "step": 287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960062503814697 + }, + { + "episode": 4624, + "epoch": 0.027704880708439683, + "loss/policy_avg": -0.3703474700450897, + "lr": 9.8159509202454e-06, + "objective/entropy": 16.20748519897461, + "objective/kl": 40.348899841308594, + "objective/non_score_reward": -2.0174450874328613, + "objective/rlhf_reward": -5.6697804689407345, + "objective/scores": 0.6, + "policy/approxkl_avg": 58.94084167480469, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.451171875, + "step": 288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000488519668579 + }, + { + "episode": 4640, + "epoch": 0.027800745347569232, + "loss/policy_avg": 0.691341757774353, + "lr": 9.815311860940695e-06, + "objective/entropy": 164.64894104003906, + "objective/kl": 35.96034240722656, + "objective/non_score_reward": -1.7980170249938965, + "objective/rlhf_reward": -2.792067980766296, + "objective/scores": 1.1, + "policy/approxkl_avg": 105.621826171875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.53125, + "step": 289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972904920578003 + }, + { + "episode": 4656, + "epoch": 0.02789660998669878, + "loss/policy_avg": 0.05122673511505127, + "lr": 9.814672801635992e-06, + "objective/entropy": 143.17758178710938, + "objective/kl": 27.651023864746094, + "objective/non_score_reward": -1.3825511932373047, + "objective/rlhf_reward": -2.6064857586633887, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 30.806257247924805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.462890625, + "step": 290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996225118637085 + }, + { + "episode": 4672, + "epoch": 0.02799247462582833, + "loss/policy_avg": -0.021466929465532303, + "lr": 9.81403374233129e-06, + "objective/entropy": 123.44010925292969, + "objective/kl": 18.645748138427734, + "objective/non_score_reward": -0.9322873950004578, + "objective/rlhf_reward": -2.403636608153505, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 24.915597915649414, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.638671875, + "step": 291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.995213270187378 + }, + { + "episode": 4688, + "epoch": 0.02808833926495788, + "loss/policy_avg": 0.700859785079956, + "lr": 9.813394683026586e-06, + "objective/entropy": 58.48292922973633, + "objective/kl": 28.2305965423584, + "objective/non_score_reward": -1.411529779434204, + "objective/rlhf_reward": -4.24611941576004, + "objective/scores": 0.35, + "policy/approxkl_avg": 21.04977035522461, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4208984375, + "step": 292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981340169906616 + }, + { + "episode": 4704, + "epoch": 0.028184203904087428, + "loss/policy_avg": 0.9605820775032043, + "lr": 9.812755623721883e-06, + "objective/entropy": -33.6519775390625, + "objective/kl": 33.635501861572266, + "objective/non_score_reward": -1.6817750930786133, + "objective/rlhf_reward": -5.065241103590118, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 18.019363403320312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4619140625, + "step": 293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000504493713379 + }, + { + "episode": 4720, + "epoch": 0.028280068543216977, + "loss/policy_avg": 0.44443511962890625, + "lr": 9.81211656441718e-06, + "objective/entropy": 61.81305694580078, + "objective/kl": 37.54548263549805, + "objective/non_score_reward": -1.8772742748260498, + "objective/rlhf_reward": -6.130494453994137, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 34.736690521240234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.701171875, + "step": 294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981006383895874 + }, + { + "episode": 4736, + "epoch": 0.028375933182346526, + "loss/policy_avg": -0.004817202687263489, + "lr": 9.811477505112475e-06, + "objective/entropy": -85.25079345703125, + "objective/kl": 22.125272750854492, + "objective/non_score_reward": -1.1062637567520142, + "objective/rlhf_reward": -3.0658050415262412, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 39.945377349853516, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.619140625, + "step": 295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001112937927246 + }, + { + "episode": 4752, + "epoch": 0.028471797821476075, + "loss/policy_avg": -0.018911486491560936, + "lr": 9.810838445807772e-06, + "objective/entropy": 187.50953674316406, + "objective/kl": 31.752737045288086, + "objective/non_score_reward": -1.587636947631836, + "objective/rlhf_reward": -4.525718684467386, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 41.095298767089844, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.560546875, + "step": 296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0318245887756348 + }, + { + "episode": 4768, + "epoch": 0.028567662460605624, + "loss/policy_avg": 0.5813855528831482, + "lr": 9.810199386503069e-06, + "objective/entropy": 13.395767211914062, + "objective/kl": 29.76428985595703, + "objective/non_score_reward": -1.4882144927978516, + "objective/rlhf_reward": -4.219524757067362, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 58.40808868408203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62109375, + "step": 297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971027374267578 + }, + { + "episode": 4784, + "epoch": 0.028663527099735173, + "loss/policy_avg": 0.25174012780189514, + "lr": 9.809560327198366e-06, + "objective/entropy": 93.99857330322266, + "objective/kl": 31.07823944091797, + "objective/non_score_reward": -1.5539120435714722, + "objective/rlhf_reward": -4.482314721743266, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 56.219329833984375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973026514053345 + }, + { + "episode": 4800, + "epoch": 0.02875939173886472, + "loss/policy_avg": -0.05966740474104881, + "lr": 9.808921267893663e-06, + "objective/entropy": 199.3701934814453, + "objective/kl": 26.15532684326172, + "objective/non_score_reward": -1.3077664375305176, + "objective/rlhf_reward": -3.7152936098896827, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 18.272422790527344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.650390625, + "step": 299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002091646194458 + }, + { + "episode": 4816, + "epoch": 0.02885525637799427, + "loss/policy_avg": 0.19725301861763, + "lr": 9.808282208588958e-06, + "objective/entropy": 112.11613464355469, + "objective/kl": 33.344722747802734, + "objective/non_score_reward": -1.667236089706421, + "objective/rlhf_reward": -6.668944478034973, + "objective/scores": 0.0, + "policy/approxkl_avg": 29.54242706298828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0038766860961914 + }, + { + "episode": 4832, + "epoch": 0.02895112101712382, + "loss/policy_avg": -0.17506346106529236, + "lr": 9.807643149284255e-06, + "objective/entropy": 70.48281860351562, + "objective/kl": 29.51511573791504, + "objective/non_score_reward": -1.4757558107376099, + "objective/rlhf_reward": -4.387251400741276, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 12.791141510009766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4814453125, + "step": 301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999895691871643 + }, + { + "episode": 4848, + "epoch": 0.029046985656253372, + "loss/policy_avg": 0.38140204548835754, + "lr": 9.80700408997955e-06, + "objective/entropy": 23.643152236938477, + "objective/kl": 27.579925537109375, + "objective/non_score_reward": -1.3789963722229004, + "objective/rlhf_reward": -3.854125951946364, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 8.89024543762207, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984774589538574 + }, + { + "episode": 4864, + "epoch": 0.02914285029538292, + "loss/policy_avg": 0.18466374278068542, + "lr": 9.806365030674847e-06, + "objective/entropy": -30.63671875, + "objective/kl": 25.678733825683594, + "objective/non_score_reward": -1.2839367389678955, + "objective/rlhf_reward": -3.6199750540577735, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 4.08036470413208, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999252200126648 + }, + { + "episode": 4880, + "epoch": 0.02923871493451247, + "loss/policy_avg": 0.20352232456207275, + "lr": 9.805725971370144e-06, + "objective/entropy": -14.465229034423828, + "objective/kl": 16.88151741027832, + "objective/non_score_reward": -0.8440757989883423, + "objective/rlhf_reward": 1.023696751892567, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.945369720458984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51171875, + "step": 304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997638463973999 + }, + { + "episode": 4896, + "epoch": 0.02933457957364202, + "loss/policy_avg": 0.36892420053482056, + "lr": 9.80508691206544e-06, + "objective/entropy": 136.53363037109375, + "objective/kl": 30.262548446655273, + "objective/non_score_reward": -1.513127326965332, + "objective/rlhf_reward": -3.1287905319940776, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 30.166175842285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.615234375, + "step": 305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001025199890137 + }, + { + "episode": 4912, + "epoch": 0.029430444212771568, + "loss/policy_avg": 0.07577557861804962, + "lr": 9.804447852760737e-06, + "objective/entropy": 77.17935943603516, + "objective/kl": 28.32352638244629, + "objective/non_score_reward": -1.4161763191223145, + "objective/rlhf_reward": -4.148933493884739, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 2.6957955360412598, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0025148391723633 + }, + { + "episode": 4928, + "epoch": 0.029526308851901117, + "loss/policy_avg": 0.1559610664844513, + "lr": 9.803808793456034e-06, + "objective/entropy": -16.938400268554688, + "objective/kl": 21.827743530273438, + "objective/non_score_reward": -1.091387152671814, + "objective/rlhf_reward": -2.703689043939696, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 7.885660171508789, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.431640625, + "step": 307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0027849674224854 + }, + { + "episode": 4944, + "epoch": 0.029622173491030666, + "loss/policy_avg": -0.17305535078048706, + "lr": 9.80316973415133e-06, + "objective/entropy": -31.412694931030273, + "objective/kl": 23.805431365966797, + "objective/non_score_reward": -1.1902716159820557, + "objective/rlhf_reward": -3.1569663322606853, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 35.29633331298828, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.654296875, + "step": 308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0007619857788086 + }, + { + "episode": 4960, + "epoch": 0.029718038130160215, + "loss/policy_avg": 0.13406828045845032, + "lr": 9.802530674846626e-06, + "objective/entropy": 68.0604248046875, + "objective/kl": 31.641517639160156, + "objective/non_score_reward": -1.582075834274292, + "objective/rlhf_reward": -4.949701407042843, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 32.652069091796875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3017578125, + "step": 309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981597661972046 + }, + { + "episode": 4976, + "epoch": 0.029813902769289764, + "loss/policy_avg": 0.3640270233154297, + "lr": 9.801891615541923e-06, + "objective/entropy": 73.73117065429688, + "objective/kl": 22.181957244873047, + "objective/non_score_reward": -1.109097957611084, + "objective/rlhf_reward": -4.436391651630402, + "objective/scores": 0.0, + "policy/approxkl_avg": 24.474929809570312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.77734375, + "step": 310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988360404968262 + }, + { + "episode": 4992, + "epoch": 0.029909767408419313, + "loss/policy_avg": 0.598778486251831, + "lr": 9.80125255623722e-06, + "objective/entropy": 77.45819854736328, + "objective/kl": 31.91500473022461, + "objective/non_score_reward": -1.5957502126693726, + "objective/rlhf_reward": -4.558172132047723, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 7.392116546630859, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.595703125, + "step": 311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976756572723389 + }, + { + "episode": 5008, + "epoch": 0.03000563204754886, + "loss/policy_avg": -0.14829277992248535, + "lr": 9.800613496932517e-06, + "objective/entropy": 73.91107940673828, + "objective/kl": 22.043235778808594, + "objective/non_score_reward": -1.1021617650985718, + "objective/rlhf_reward": -3.0086471796035763, + "objective/scores": 0.35, + "policy/approxkl_avg": 7.375496864318848, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.548828125, + "step": 312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0021305084228516 + }, + { + "episode": 5024, + "epoch": 0.03010149668667841, + "loss/policy_avg": 0.34449532628059387, + "lr": 9.799974437627812e-06, + "objective/entropy": 27.04425048828125, + "objective/kl": 31.98007583618164, + "objective/non_score_reward": -1.599003791809082, + "objective/rlhf_reward": -4.996015524864196, + "objective/scores": 0.35, + "policy/approxkl_avg": 53.630210876464844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.529296875, + "step": 313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990134239196777 + }, + { + "episode": 5040, + "epoch": 0.03019736132580796, + "loss/policy_avg": 0.029857225716114044, + "lr": 9.799335378323109e-06, + "objective/entropy": 147.96096801757812, + "objective/kl": 27.342838287353516, + "objective/non_score_reward": -1.3671419620513916, + "objective/rlhf_reward": -4.017969946475372, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 18.108400344848633, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.638671875, + "step": 314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988747835159302 + }, + { + "episode": 5056, + "epoch": 0.03029322596493751, + "loss/policy_avg": 0.05283927917480469, + "lr": 9.798696319018406e-06, + "objective/entropy": -46.846099853515625, + "objective/kl": 30.715242385864258, + "objective/non_score_reward": -1.535762071609497, + "objective/rlhf_reward": -4.538928542200642, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 66.26033020019531, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6484375, + "step": 315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992979764938354 + }, + { + "episode": 5072, + "epoch": 0.030389090604067057, + "loss/policy_avg": 0.2858242094516754, + "lr": 9.798057259713703e-06, + "objective/entropy": -156.9435577392578, + "objective/kl": 31.284622192382812, + "objective/non_score_reward": -1.5642311573028564, + "objective/rlhf_reward": -4.915288856535583, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 74.38943481445312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992833137512207 + }, + { + "episode": 5088, + "epoch": 0.030484955243196606, + "loss/policy_avg": 0.28274843096733093, + "lr": 9.797418200409e-06, + "objective/entropy": -214.69573974609375, + "objective/kl": 22.27606201171875, + "objective/non_score_reward": -1.1138031482696533, + "objective/rlhf_reward": -2.3325063607850414, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 35.48945236206055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9962687492370605 + }, + { + "episode": 5104, + "epoch": 0.030580819882326155, + "loss/policy_avg": -0.08736838400363922, + "lr": 9.796779141104296e-06, + "objective/entropy": -18.148971557617188, + "objective/kl": 27.546077728271484, + "objective/non_score_reward": -1.377303957939148, + "objective/rlhf_reward": -4.1306134844697535, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 76.84832000732422, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6171875, + "step": 318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0014290809631348 + }, + { + "episode": 5120, + "epoch": 0.030676684521455704, + "loss/policy_avg": 0.031098078936338425, + "lr": 9.796140081799592e-06, + "objective/entropy": 103.30211639404297, + "objective/kl": 27.747032165527344, + "objective/non_score_reward": -1.3873515129089355, + "objective/rlhf_reward": -4.033634447845158, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 54.69970703125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985463619232178 + }, + { + "episode": 5136, + "epoch": 0.030772549160585253, + "loss/policy_avg": 0.3622899651527405, + "lr": 9.795501022494888e-06, + "objective/entropy": 66.0567398071289, + "objective/kl": 26.39444351196289, + "objective/non_score_reward": -1.3197221755981445, + "objective/rlhf_reward": -3.7226295759349615, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 5.640605449676514, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992847442626953 + }, + { + "episode": 5152, + "epoch": 0.030868413799714802, + "loss/policy_avg": -0.10469883680343628, + "lr": 9.794861963190185e-06, + "objective/entropy": 35.81920623779297, + "objective/kl": 25.668739318847656, + "objective/non_score_reward": -1.2834370136260986, + "objective/rlhf_reward": -3.7337480843067166, + "objective/scores": 0.35, + "policy/approxkl_avg": 5.808808326721191, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6484375, + "step": 321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999007225036621 + }, + { + "episode": 5168, + "epoch": 0.03096427843884435, + "loss/policy_avg": -0.2741212248802185, + "lr": 9.794222903885482e-06, + "objective/entropy": 52.38888168334961, + "objective/kl": 34.969974517822266, + "objective/non_score_reward": -1.748498797416687, + "objective/rlhf_reward": -5.652359655409484, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 8.913843154907227, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3798828125, + "step": 322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0103840827941895 + }, + { + "episode": 5184, + "epoch": 0.0310601430779739, + "loss/policy_avg": 0.30122414231300354, + "lr": 9.793583844580777e-06, + "objective/entropy": 134.16075134277344, + "objective/kl": 25.608116149902344, + "objective/non_score_reward": -1.280405879020691, + "objective/rlhf_reward": -3.1742123318480804, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 74.33633422851562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.603515625, + "step": 323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.014462471008301 + }, + { + "episode": 5200, + "epoch": 0.03115600771710345, + "loss/policy_avg": 0.26204991340637207, + "lr": 9.792944785276074e-06, + "objective/entropy": 2.559833526611328, + "objective/kl": 25.519519805908203, + "objective/non_score_reward": -1.2759759426116943, + "objective/rlhf_reward": -3.74465426180212, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 92.09954071044922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.57421875, + "step": 324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998145580291748 + }, + { + "episode": 5216, + "epoch": 0.031251872356233, + "loss/policy_avg": 0.18864840269088745, + "lr": 9.792305725971371e-06, + "objective/entropy": 48.99184036254883, + "objective/kl": 28.022377014160156, + "objective/non_score_reward": -1.4011187553405762, + "objective/rlhf_reward": -4.123522403653025, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 22.120746612548828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60546875, + "step": 325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984712600708008 + }, + { + "episode": 5232, + "epoch": 0.03134773699536255, + "loss/policy_avg": 0.42162489891052246, + "lr": 9.791666666666666e-06, + "objective/entropy": -129.23065185546875, + "objective/kl": 31.687660217285156, + "objective/non_score_reward": -1.5843830108642578, + "objective/rlhf_reward": -4.821760052236256, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 111.98194885253906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996734857559204 + }, + { + "episode": 5248, + "epoch": 0.0314436016344921, + "loss/policy_avg": -0.07900102436542511, + "lr": 9.791027607361963e-06, + "objective/entropy": 31.351696014404297, + "objective/kl": 27.038206100463867, + "objective/non_score_reward": -1.3519103527069092, + "objective/rlhf_reward": -5.407641291618347, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.7061767578125, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4423828125, + "step": 327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0043118000030518 + }, + { + "episode": 5264, + "epoch": 0.03153946627362165, + "loss/policy_avg": 0.16587843000888824, + "lr": 9.79038854805726e-06, + "objective/entropy": 143.86651611328125, + "objective/kl": 27.42593765258789, + "objective/non_score_reward": -1.3712968826293945, + "objective/rlhf_reward": -4.125937962268276, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 119.49800872802734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999497890472412 + }, + { + "episode": 5280, + "epoch": 0.0316353309127512, + "loss/policy_avg": 0.29106539487838745, + "lr": 9.789749488752557e-06, + "objective/entropy": 67.8651351928711, + "objective/kl": 32.114479064941406, + "objective/non_score_reward": -1.6057239770889282, + "objective/rlhf_reward": -5.08126013567987, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 7.976801872253418, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.470703125, + "step": 329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0011985301971436 + }, + { + "episode": 5296, + "epoch": 0.031731195551880746, + "loss/policy_avg": 0.5780457258224487, + "lr": 9.789110429447854e-06, + "objective/entropy": 104.15371704101562, + "objective/kl": 30.92220687866211, + "objective/non_score_reward": -1.5461102724075317, + "objective/rlhf_reward": -3.784441030025482, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.566375732421875, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.55859375, + "step": 330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993044137954712 + }, + { + "episode": 5312, + "epoch": 0.031827060191010295, + "loss/policy_avg": 0.24728742241859436, + "lr": 9.78847137014315e-06, + "objective/entropy": -95.75634765625, + "objective/kl": 30.755779266357422, + "objective/non_score_reward": -1.5377888679504395, + "objective/rlhf_reward": -4.791905486319942, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 32.567970275878906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991464614868164 + }, + { + "episode": 5328, + "epoch": 0.031922924830139844, + "loss/policy_avg": 1.9531396627426147, + "lr": 9.787832310838446e-06, + "objective/entropy": 18.057151794433594, + "objective/kl": 21.966590881347656, + "objective/non_score_reward": -1.0983295440673828, + "objective/rlhf_reward": -2.993318116664886, + "objective/scores": 0.35, + "policy/approxkl_avg": 11.555295944213867, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.716796875, + "step": 332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.004971981048584 + }, + { + "episode": 5344, + "epoch": 0.03201878946926939, + "loss/policy_avg": 0.0304682869464159, + "lr": 9.787193251533743e-06, + "objective/entropy": -100.86114501953125, + "objective/kl": 21.19540023803711, + "objective/non_score_reward": -1.0597699880599976, + "objective/rlhf_reward": -2.8604777837670863, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 36.17786407470703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46875, + "step": 333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997187852859497 + }, + { + "episode": 5360, + "epoch": 0.03211465410839894, + "loss/policy_avg": 0.2974792718887329, + "lr": 9.78655419222904e-06, + "objective/entropy": 59.0064697265625, + "objective/kl": 23.83527183532715, + "objective/non_score_reward": -1.1917636394500732, + "objective/rlhf_reward": -3.2861017016724343, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 27.08124542236328, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.76171875, + "step": 334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005297660827637 + }, + { + "episode": 5376, + "epoch": 0.03221051874752849, + "loss/policy_avg": 0.20310130715370178, + "lr": 9.785915132924337e-06, + "objective/entropy": 51.579200744628906, + "objective/kl": 26.064043045043945, + "objective/non_score_reward": -1.3032021522521973, + "objective/rlhf_reward": -3.656549363341883, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 3.1224026679992676, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.51171875, + "step": 335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0020689964294434 + }, + { + "episode": 5392, + "epoch": 0.03230638338665804, + "loss/policy_avg": -0.22360196709632874, + "lr": 9.785276073619633e-06, + "objective/entropy": 8.019195556640625, + "objective/kl": 34.267356872558594, + "objective/non_score_reward": -1.7133680582046509, + "objective/rlhf_reward": -5.40287409266983, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 27.402694702148438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.517578125, + "step": 336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99736487865448 + }, + { + "episode": 5408, + "epoch": 0.03240224802578759, + "loss/policy_avg": 0.394004225730896, + "lr": 9.784637014314929e-06, + "objective/entropy": -7.316375732421875, + "objective/kl": 34.60337829589844, + "objective/non_score_reward": -1.7301688194274902, + "objective/rlhf_reward": -3.9969565018427105, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 60.58606719970703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990360736846924 + }, + { + "episode": 5424, + "epoch": 0.03249811266491714, + "loss/policy_avg": 0.08118537068367004, + "lr": 9.783997955010226e-06, + "objective/entropy": 3.808826446533203, + "objective/kl": 33.9757080078125, + "objective/non_score_reward": -1.6987853050231934, + "objective/rlhf_reward": -5.3713093592720895, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 49.47349548339844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.546875, + "step": 338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974095821380615 + }, + { + "episode": 5440, + "epoch": 0.03259397730404669, + "loss/policy_avg": 0.1250596046447754, + "lr": 9.783358895705522e-06, + "objective/entropy": -42.7471809387207, + "objective/kl": 27.222618103027344, + "objective/non_score_reward": -1.361130952835083, + "objective/rlhf_reward": -3.9287524459683265, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 8.669515609741211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.509765625, + "step": 339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002639293670654 + }, + { + "episode": 5456, + "epoch": 0.032689841943176236, + "loss/policy_avg": 1.2977867126464844, + "lr": 9.78271983640082e-06, + "objective/entropy": -60.51675796508789, + "objective/kl": 27.726932525634766, + "objective/non_score_reward": -1.3863465785980225, + "objective/rlhf_reward": -4.064433994706034, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 52.59510803222656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4345703125, + "step": 340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984922409057617 + }, + { + "episode": 5472, + "epoch": 0.032785706582305785, + "loss/policy_avg": 0.10771232098340988, + "lr": 9.782080777096116e-06, + "objective/entropy": 39.22501754760742, + "objective/kl": 38.581573486328125, + "objective/non_score_reward": -1.9290788173675537, + "objective/rlhf_reward": -6.374679616003662, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 16.336502075195312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.599609375, + "step": 341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990124702453613 + }, + { + "episode": 5488, + "epoch": 0.032881571221435334, + "loss/policy_avg": 0.029969744384288788, + "lr": 9.781441717791413e-06, + "objective/entropy": 54.763675689697266, + "objective/kl": 27.586057662963867, + "objective/non_score_reward": -1.379302978515625, + "objective/rlhf_reward": -3.7838785807291666, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 29.997591018676758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4462890625, + "step": 342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992620944976807 + }, + { + "episode": 5504, + "epoch": 0.03297743586056488, + "loss/policy_avg": -0.003006638027727604, + "lr": 9.780802658486708e-06, + "objective/entropy": 4.6327056884765625, + "objective/kl": 25.01122283935547, + "objective/non_score_reward": -1.250560998916626, + "objective/rlhf_reward": -3.054833005146916, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.332850694656372, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.51171875, + "step": 343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0042150020599365 + }, + { + "episode": 5520, + "epoch": 0.03307330049969443, + "loss/policy_avg": -0.2595655918121338, + "lr": 9.780163599182005e-06, + "objective/entropy": -9.382579803466797, + "objective/kl": 25.310394287109375, + "objective/non_score_reward": -1.2655197381973267, + "objective/rlhf_reward": -3.611480812640533, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 35.86376190185547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.708984375, + "step": 344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991991519927979 + }, + { + "episode": 5536, + "epoch": 0.03316916513882398, + "loss/policy_avg": 1.6723182201385498, + "lr": 9.7795245398773e-06, + "objective/entropy": 167.249267578125, + "objective/kl": 38.30883026123047, + "objective/non_score_reward": -1.915441632270813, + "objective/rlhf_reward": -6.283164360610348, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 27.648231506347656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.521484375, + "step": 345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966659545898438 + }, + { + "episode": 5552, + "epoch": 0.03326502977795353, + "loss/policy_avg": 0.21136921644210815, + "lr": 9.778885480572597e-06, + "objective/entropy": 202.48263549804688, + "objective/kl": 28.62633514404297, + "objective/non_score_reward": -1.4313167333602905, + "objective/rlhf_reward": -4.169007628169611, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 28.591995239257812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9962763786315918 + }, + { + "episode": 5568, + "epoch": 0.03336089441708308, + "loss/policy_avg": 0.030091844499111176, + "lr": 9.778246421267894e-06, + "objective/entropy": 178.1235809326172, + "objective/kl": 37.731300354003906, + "objective/non_score_reward": -1.8865652084350586, + "objective/rlhf_reward": -5.990001528468683, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 17.381601333618164, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.52734375, + "step": 347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001326322555542 + }, + { + "episode": 5584, + "epoch": 0.03345675905621263, + "loss/policy_avg": 0.40717682242393494, + "lr": 9.777607361963191e-06, + "objective/entropy": 90.73904418945312, + "objective/kl": 31.88462257385254, + "objective/non_score_reward": -1.594231128692627, + "objective/rlhf_reward": -5.0176747677072715, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 37.96768569946289, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5703125, + "step": 348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991170167922974 + }, + { + "episode": 5600, + "epoch": 0.033552623695342176, + "loss/policy_avg": 0.5422201156616211, + "lr": 9.776968302658488e-06, + "objective/entropy": 80.41102600097656, + "objective/kl": 34.64447021484375, + "objective/non_score_reward": -1.7322235107421875, + "objective/rlhf_reward": -5.478295783610687, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 117.23408508300781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.54296875, + "step": 349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983158111572266 + }, + { + "episode": 5616, + "epoch": 0.033648488334471725, + "loss/policy_avg": 0.3756037950515747, + "lr": 9.776329243353783e-06, + "objective/entropy": 61.65838623046875, + "objective/kl": 44.269325256347656, + "objective/non_score_reward": -2.213466167449951, + "objective/rlhf_reward": -7.40326676806961, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 19.3502254486084, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988865852355957 + }, + { + "episode": 5632, + "epoch": 0.033744352973601274, + "loss/policy_avg": 0.9775654673576355, + "lr": 9.77569018404908e-06, + "objective/entropy": 57.90337371826172, + "objective/kl": 41.80830383300781, + "objective/non_score_reward": -2.0904150009155273, + "objective/rlhf_reward": -6.628326908747354, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 84.0235824584961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.84765625, + "step": 351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9960914850234985 + }, + { + "episode": 5648, + "epoch": 0.03384021761273082, + "loss/policy_avg": -0.20816992223262787, + "lr": 9.775051124744377e-06, + "objective/entropy": -118.41542053222656, + "objective/kl": 23.201061248779297, + "objective/non_score_reward": -1.160053014755249, + "objective/rlhf_reward": -2.8153834894028416, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 4.062729835510254, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.4453125, + "step": 352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0035219192504883 + }, + { + "episode": 5664, + "epoch": 0.03393608225186037, + "loss/policy_avg": 0.035901207476854324, + "lr": 9.774412065439674e-06, + "objective/entropy": 154.33920288085938, + "objective/kl": 28.773828506469727, + "objective/non_score_reward": -1.4386913776397705, + "objective/rlhf_reward": -2.8310468539011207, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.329944610595703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4755859375, + "step": 353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003747940063477 + }, + { + "episode": 5680, + "epoch": 0.03403194689098992, + "loss/policy_avg": 0.7185342311859131, + "lr": 9.77377300613497e-06, + "objective/entropy": 45.80010986328125, + "objective/kl": 35.51177215576172, + "objective/non_score_reward": -1.7755887508392334, + "objective/rlhf_reward": -5.586583339961704, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 69.95939636230469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51953125, + "step": 354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996915340423584 + }, + { + "episode": 5696, + "epoch": 0.03412781153011947, + "loss/policy_avg": 0.871320903301239, + "lr": 9.773133946830267e-06, + "objective/entropy": 136.34942626953125, + "objective/kl": 37.25979995727539, + "objective/non_score_reward": -1.862990140914917, + "objective/rlhf_reward": -5.504549334721501, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 58.879180908203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990177154541016 + }, + { + "episode": 5712, + "epoch": 0.03422367616924902, + "loss/policy_avg": 0.14556461572647095, + "lr": 9.772494887525563e-06, + "objective/entropy": -10.28516960144043, + "objective/kl": 29.231609344482422, + "objective/non_score_reward": -1.461580514907837, + "objective/rlhf_reward": -4.021493013176035, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 39.2762451171875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985218048095703 + }, + { + "episode": 5728, + "epoch": 0.03431954080837857, + "loss/policy_avg": 0.27659082412719727, + "lr": 9.77185582822086e-06, + "objective/entropy": -36.31108093261719, + "objective/kl": 32.386661529541016, + "objective/non_score_reward": -1.619333028793335, + "objective/rlhf_reward": -6.47733199596405, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.265704154968262, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71484375, + "step": 357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992687702178955 + }, + { + "episode": 5744, + "epoch": 0.03441540544750812, + "loss/policy_avg": 0.10546956956386566, + "lr": 9.771216768916156e-06, + "objective/entropy": 79.19872283935547, + "objective/kl": 22.353626251220703, + "objective/non_score_reward": -1.1176813840866089, + "objective/rlhf_reward": -3.0201275154069513, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 20.73809051513672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4873046875, + "step": 358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974703788757324 + }, + { + "episode": 5760, + "epoch": 0.034511270086637666, + "loss/policy_avg": 0.5648351311683655, + "lr": 9.770577709611453e-06, + "objective/entropy": 38.47356033325195, + "objective/kl": 23.87390899658203, + "objective/non_score_reward": -1.1936955451965332, + "objective/rlhf_reward": -3.4331463485056455, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 8.14659595489502, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011978149414062 + }, + { + "episode": 5776, + "epoch": 0.034607134725767215, + "loss/policy_avg": 0.5912380814552307, + "lr": 9.76993865030675e-06, + "objective/entropy": 116.97152709960938, + "objective/kl": 40.231689453125, + "objective/non_score_reward": -2.011584758758545, + "objective/rlhf_reward": -6.565385702069163, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 117.33955383300781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.666015625, + "step": 360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969582557678223 + }, + { + "episode": 5792, + "epoch": 0.034702999364896764, + "loss/policy_avg": -0.019477106630802155, + "lr": 9.769299591002045e-06, + "objective/entropy": -144.96791076660156, + "objective/kl": 27.773448944091797, + "objective/non_score_reward": -1.3886725902557373, + "objective/rlhf_reward": -5.554690062999725, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.48216438293457, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.626953125, + "step": 361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000136137008667 + }, + { + "episode": 5808, + "epoch": 0.03479886400402631, + "loss/policy_avg": -0.5155759453773499, + "lr": 9.768660531697342e-06, + "objective/entropy": 78.00074768066406, + "objective/kl": 34.501590728759766, + "objective/non_score_reward": -1.7250795364379883, + "objective/rlhf_reward": -5.521715917674404, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 122.40145874023438, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.513671875, + "step": 362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.02427077293396 + }, + { + "episode": 5824, + "epoch": 0.03489472864315586, + "loss/policy_avg": 0.3520805537700653, + "lr": 9.768021472392639e-06, + "objective/entropy": -66.29779815673828, + "objective/kl": 23.767650604248047, + "objective/non_score_reward": -1.188382625579834, + "objective/rlhf_reward": -3.3535303235054013, + "objective/scores": 0.35, + "policy/approxkl_avg": 66.86349487304688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.51953125, + "step": 363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973565340042114 + }, + { + "episode": 5840, + "epoch": 0.03499059328228541, + "loss/policy_avg": 0.25808075070381165, + "lr": 9.767382413087936e-06, + "objective/entropy": 55.69321060180664, + "objective/kl": 32.73713684082031, + "objective/non_score_reward": -1.6368569135665894, + "objective/rlhf_reward": -4.147427594661712, + "objective/scores": 0.6, + "policy/approxkl_avg": 17.00968360900879, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998908519744873 + }, + { + "episode": 5856, + "epoch": 0.03508645792141496, + "loss/policy_avg": -0.33678027987480164, + "lr": 9.766743353783233e-06, + "objective/entropy": 63.459205627441406, + "objective/kl": 36.74503707885742, + "objective/non_score_reward": -1.837251901626587, + "objective/rlhf_reward": -5.226301344410453, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 63.5507926940918, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7734375, + "step": 365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005531311035156 + }, + { + "episode": 5872, + "epoch": 0.03518232256054451, + "loss/policy_avg": 0.397920161485672, + "lr": 9.76610429447853e-06, + "objective/entropy": -11.37314224243164, + "objective/kl": 32.99299240112305, + "objective/non_score_reward": -1.6496496200561523, + "objective/rlhf_reward": -5.174766202171413, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 28.19782257080078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65625, + "step": 366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984650611877441 + }, + { + "episode": 5888, + "epoch": 0.03527818719967406, + "loss/policy_avg": 0.5101684331893921, + "lr": 9.765465235173825e-06, + "objective/entropy": 122.12913513183594, + "objective/kl": 39.20099639892578, + "objective/non_score_reward": -1.9600497484207153, + "objective/rlhf_reward": -6.480949008201046, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 10.180255889892578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.45703125, + "step": 367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976162910461426 + }, + { + "episode": 5904, + "epoch": 0.03537405183880361, + "loss/policy_avg": -0.46757811307907104, + "lr": 9.764826175869122e-06, + "objective/entropy": -108.47764587402344, + "objective/kl": 25.862443923950195, + "objective/non_score_reward": -1.2931220531463623, + "objective/rlhf_reward": -3.6162289073138982, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.3750016689300537, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.703125, + "step": 368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0032150745391846 + }, + { + "episode": 5920, + "epoch": 0.035469916477933155, + "loss/policy_avg": 0.12928390502929688, + "lr": 9.764187116564417e-06, + "objective/entropy": 47.25078201293945, + "objective/kl": 23.20449447631836, + "objective/non_score_reward": -1.1602246761322021, + "objective/rlhf_reward": -2.240898942947388, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.1992838382720947, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.544921875, + "step": 369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0065484046936035 + }, + { + "episode": 5936, + "epoch": 0.035565781117062704, + "loss/policy_avg": 0.15939241647720337, + "lr": 9.763548057259714e-06, + "objective/entropy": -19.609264373779297, + "objective/kl": 28.25977325439453, + "objective/non_score_reward": -1.4129884243011475, + "objective/rlhf_reward": -4.273351618138653, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 59.99807357788086, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0036120414733887 + }, + { + "episode": 5952, + "epoch": 0.03566164575619225, + "loss/policy_avg": 0.1767190843820572, + "lr": 9.76290899795501e-06, + "objective/entropy": -11.536600112915039, + "objective/kl": 36.28870391845703, + "objective/non_score_reward": -1.8144354820251465, + "objective/rlhf_reward": -7.257741451263428, + "objective/scores": 0.0, + "policy/approxkl_avg": 11.846475601196289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99981689453125 + }, + { + "episode": 5968, + "epoch": 0.0357575103953218, + "loss/policy_avg": 0.3314260244369507, + "lr": 9.762269938650308e-06, + "objective/entropy": -30.279476165771484, + "objective/kl": 28.756494522094727, + "objective/non_score_reward": -1.4378247261047363, + "objective/rlhf_reward": -4.300700943084106, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 39.342529296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998626708984375 + }, + { + "episode": 5984, + "epoch": 0.03585337503445135, + "loss/policy_avg": 0.18494009971618652, + "lr": 9.761630879345604e-06, + "objective/entropy": 68.65098571777344, + "objective/kl": 36.555747985839844, + "objective/non_score_reward": -1.8277872800827026, + "objective/rlhf_reward": -5.486320610317301, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.346623420715332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.517578125, + "step": 373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000062942504883 + }, + { + "episode": 6000, + "epoch": 0.03594923967358091, + "loss/policy_avg": -0.05254024267196655, + "lr": 9.7609918200409e-06, + "objective/entropy": -30.816913604736328, + "objective/kl": 26.80430793762207, + "objective/non_score_reward": -1.3402154445648193, + "objective/rlhf_reward": -3.845090114864048, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 3.3415722846984863, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991049766540527 + }, + { + "episode": 6016, + "epoch": 0.036045104312710456, + "loss/policy_avg": 0.48673489689826965, + "lr": 9.760352760736196e-06, + "objective/entropy": -54.172760009765625, + "objective/kl": 26.726612091064453, + "objective/non_score_reward": -1.3363306522369385, + "objective/rlhf_reward": -0.945322489738464, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.357444763183594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.708984375, + "step": 375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999312400817871 + }, + { + "episode": 6032, + "epoch": 0.036140968951840005, + "loss/policy_avg": -0.06733483076095581, + "lr": 9.759713701431493e-06, + "objective/entropy": 135.20721435546875, + "objective/kl": 37.13209915161133, + "objective/non_score_reward": -1.856605052947998, + "objective/rlhf_reward": -4.5027009590875835, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 17.97521209716797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4873046875, + "step": 376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002077341079712 + }, + { + "episode": 6048, + "epoch": 0.036236833590969554, + "loss/policy_avg": -0.041654448956251144, + "lr": 9.75907464212679e-06, + "objective/entropy": -167.12548828125, + "objective/kl": 25.773399353027344, + "objective/non_score_reward": -1.2886700630187988, + "objective/rlhf_reward": -0.7546801328659054, + "objective/scores": 1.1, + "policy/approxkl_avg": 0.800922691822052, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.521484375, + "step": 377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000338315963745 + }, + { + "episode": 6064, + "epoch": 0.0363326982300991, + "loss/policy_avg": 0.03024141490459442, + "lr": 9.758435582822087e-06, + "objective/entropy": -73.82417297363281, + "objective/kl": 26.33017349243164, + "objective/non_score_reward": -1.3165086507797241, + "objective/rlhf_reward": -3.14332831122068, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 23.01593780517578, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.63671875, + "step": 378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002521276473999 + }, + { + "episode": 6080, + "epoch": 0.03642856286922865, + "loss/policy_avg": 0.285569429397583, + "lr": 9.757796523517384e-06, + "objective/entropy": -111.42575073242188, + "objective/kl": 28.885374069213867, + "objective/non_score_reward": -1.4442687034606934, + "objective/rlhf_reward": -4.398472824183804, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 79.57511901855469, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.61328125, + "step": 379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979077577590942 + }, + { + "episode": 6096, + "epoch": 0.0365244275083582, + "loss/policy_avg": -0.022392742335796356, + "lr": 9.75715746421268e-06, + "objective/entropy": -79.86695098876953, + "objective/kl": 17.694236755371094, + "objective/non_score_reward": -0.8847118616104126, + "objective/rlhf_reward": -1.7140187576142063, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 4.339657306671143, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.767578125, + "step": 380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0013508796691895 + }, + { + "episode": 6112, + "epoch": 0.03662029214748775, + "loss/policy_avg": 0.4459357261657715, + "lr": 9.756518404907976e-06, + "objective/entropy": -148.62872314453125, + "objective/kl": 21.098934173583984, + "objective/non_score_reward": -1.054946780204773, + "objective/rlhf_reward": 0.1802129983901981, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.359186172485352, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.576171875, + "step": 381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992458820343018 + }, + { + "episode": 6128, + "epoch": 0.0367161567866173, + "loss/policy_avg": -0.012147974222898483, + "lr": 9.755879345603273e-06, + "objective/entropy": 152.35232543945312, + "objective/kl": 31.486684799194336, + "objective/non_score_reward": -1.5743342638015747, + "objective/rlhf_reward": -3.3736180409204692, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 35.51153564453125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999483585357666 + }, + { + "episode": 6144, + "epoch": 0.03681202142574685, + "loss/policy_avg": 0.012859173119068146, + "lr": 9.75524028629857e-06, + "objective/entropy": 26.343887329101562, + "objective/kl": 33.34328079223633, + "objective/non_score_reward": -1.6671642065048218, + "objective/rlhf_reward": -4.721245358662541, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 22.614994049072266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.82421875, + "step": 383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0018911361694336 + }, + { + "episode": 6160, + "epoch": 0.0369078860648764, + "loss/policy_avg": 0.21653258800506592, + "lr": 9.754601226993867e-06, + "objective/entropy": 109.49678039550781, + "objective/kl": 43.73469543457031, + "objective/non_score_reward": -2.186734676361084, + "objective/rlhf_reward": -7.296340326876983, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 31.000137329101562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689453125, + "step": 384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001706123352051 + }, + { + "episode": 6176, + "epoch": 0.037003750704005946, + "loss/policy_avg": 0.17637991905212402, + "lr": 9.753962167689162e-06, + "objective/entropy": -57.256038665771484, + "objective/kl": 20.548786163330078, + "objective/non_score_reward": -1.0274393558502197, + "objective/rlhf_reward": -1.9870514891305304, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.945226669311523, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.71484375, + "step": 385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000217914581299 + }, + { + "episode": 6192, + "epoch": 0.037099615343135495, + "loss/policy_avg": 0.23474755883216858, + "lr": 9.753323108384459e-06, + "objective/entropy": -67.67970275878906, + "objective/kl": 29.886417388916016, + "objective/non_score_reward": -1.4943209886550903, + "objective/rlhf_reward": -4.461511933597263, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 30.2872314453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.517578125, + "step": 386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9968770742416382 + }, + { + "episode": 6208, + "epoch": 0.037195479982265044, + "loss/policy_avg": 3.0326309204101562, + "lr": 9.752684049079756e-06, + "objective/entropy": -30.304298400878906, + "objective/kl": 34.21199035644531, + "objective/non_score_reward": -1.710599660873413, + "objective/rlhf_reward": -5.391800324530944, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 191.66567993164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998448133468628 + }, + { + "episode": 6224, + "epoch": 0.03729134462139459, + "loss/policy_avg": 0.020012550055980682, + "lr": 9.752044989775053e-06, + "objective/entropy": -44.4876594543457, + "objective/kl": 30.23657989501953, + "objective/non_score_reward": -1.5118290185928345, + "objective/rlhf_reward": -4.099904905037816, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 21.57486915588379, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.767578125, + "step": 388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002869129180908 + }, + { + "episode": 6240, + "epoch": 0.03738720926052414, + "loss/policy_avg": 0.33562996983528137, + "lr": 9.751405930470348e-06, + "objective/entropy": -154.47891235351562, + "objective/kl": 18.6168155670166, + "objective/non_score_reward": -0.9308407306671143, + "objective/rlhf_reward": -3.723362982273102, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.14146614074707, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65625, + "step": 389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002055168151855 + }, + { + "episode": 6256, + "epoch": 0.03748307389965369, + "loss/policy_avg": 0.037651438266038895, + "lr": 9.750766871165645e-06, + "objective/entropy": -6.050981521606445, + "objective/kl": 26.29869270324707, + "objective/non_score_reward": -1.3149347305297852, + "objective/rlhf_reward": -5.25973904132843, + "objective/scores": 0.0, + "policy/approxkl_avg": 27.001697540283203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.759765625, + "step": 390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982492923736572 + }, + { + "episode": 6272, + "epoch": 0.03757893853878324, + "loss/policy_avg": 0.1277342140674591, + "lr": 9.750127811860941e-06, + "objective/entropy": -114.59310913085938, + "objective/kl": 33.31782531738281, + "objective/non_score_reward": -1.6658912897109985, + "objective/rlhf_reward": -3.739846025348875, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 30.69461441040039, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.755859375, + "step": 391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998853206634521 + }, + { + "episode": 6288, + "epoch": 0.03767480317791279, + "loss/policy_avg": 0.08161749690771103, + "lr": 9.749488752556238e-06, + "objective/entropy": 28.02770233154297, + "objective/kl": 25.580188751220703, + "objective/non_score_reward": -1.279009461402893, + "objective/rlhf_reward": -3.6654397054627985, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 11.5637845993042, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3642578125, + "step": 392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9969701766967773 + }, + { + "episode": 6304, + "epoch": 0.03777066781704234, + "loss/policy_avg": 0.013617899268865585, + "lr": 9.748849693251534e-06, + "objective/entropy": 137.66958618164062, + "objective/kl": 36.88829040527344, + "objective/non_score_reward": -1.8444143533706665, + "objective/rlhf_reward": -5.999055602637631, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.8839926719665527, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.703125, + "step": 393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998722076416016 + }, + { + "episode": 6320, + "epoch": 0.037866532456171886, + "loss/policy_avg": 0.7664667963981628, + "lr": 9.74821063394683e-06, + "objective/entropy": 12.1875, + "objective/kl": 27.703767776489258, + "objective/non_score_reward": -1.385188341140747, + "objective/rlhf_reward": -4.181503379081173, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 29.00311279296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983487129211426 + }, + { + "episode": 6336, + "epoch": 0.037962397095301435, + "loss/policy_avg": 0.13891640305519104, + "lr": 9.747571574642127e-06, + "objective/entropy": -52.291236877441406, + "objective/kl": 29.62856101989746, + "objective/non_score_reward": -1.4814281463623047, + "objective/rlhf_reward": -4.10088383701713, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 45.48643112182617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.763671875, + "step": 395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9967341423034668 + }, + { + "episode": 6352, + "epoch": 0.038058261734430984, + "loss/policy_avg": -0.5259265899658203, + "lr": 9.746932515337424e-06, + "objective/entropy": -14.848602294921875, + "objective/kl": 36.51825714111328, + "objective/non_score_reward": -1.8259128332138062, + "objective/rlhf_reward": -5.180944981352363, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 133.36766052246094, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.673828125, + "step": 396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.008728504180908 + }, + { + "episode": 6368, + "epoch": 0.03815412637356053, + "loss/policy_avg": 0.1340530812740326, + "lr": 9.746293456032721e-06, + "objective/entropy": -13.48861312866211, + "objective/kl": 24.147233963012695, + "objective/non_score_reward": -1.2073616981506348, + "objective/rlhf_reward": -3.0961134592692057, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.865433216094971, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.787109375, + "step": 397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0002119541168213 + }, + { + "episode": 6384, + "epoch": 0.03824999101269008, + "loss/policy_avg": 0.036313191056251526, + "lr": 9.745654396728016e-06, + "objective/entropy": -118.45596313476562, + "objective/kl": 26.90463638305664, + "objective/non_score_reward": -1.3452317714691162, + "objective/rlhf_reward": -3.5560982182350864, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 7.153594017028809, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.708984375, + "step": 398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009984970092773 + }, + { + "episode": 6400, + "epoch": 0.03834585565181963, + "loss/policy_avg": 0.07543957978487015, + "lr": 9.745015337423313e-06, + "objective/entropy": 5.307586669921875, + "objective/kl": 29.030933380126953, + "objective/non_score_reward": -1.4515466690063477, + "objective/rlhf_reward": -2.88246778094885, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 14.018705368041992, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.611328125, + "step": 399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984686374664307 + }, + { + "episode": 6416, + "epoch": 0.03844172029094918, + "loss/policy_avg": 0.11864852905273438, + "lr": 9.74437627811861e-06, + "objective/entropy": 10.484695434570312, + "objective/kl": 24.462554931640625, + "objective/non_score_reward": -1.2231277227401733, + "objective/rlhf_reward": -3.376739227565464, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 14.423017501831055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988946914672852 + }, + { + "episode": 6432, + "epoch": 0.03853758493007873, + "loss/policy_avg": -0.036792345345020294, + "lr": 9.743737218813907e-06, + "objective/entropy": -181.87400817871094, + "objective/kl": 23.07555389404297, + "objective/non_score_reward": -1.153777837753296, + "objective/rlhf_reward": -3.191279132564632, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 20.132736206054688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.708984375, + "step": 401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00081729888916 + }, + { + "episode": 6448, + "epoch": 0.03863344956920828, + "loss/policy_avg": 0.22927281260490417, + "lr": 9.743098159509204e-06, + "objective/entropy": -88.96450805664062, + "objective/kl": 32.569129943847656, + "objective/non_score_reward": -1.6284565925598145, + "objective/rlhf_reward": -4.780492917696634, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 49.499900817871094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982388019561768 + }, + { + "episode": 6464, + "epoch": 0.03872931420833783, + "loss/policy_avg": 0.30984753370285034, + "lr": 9.7424591002045e-06, + "objective/entropy": -18.365474700927734, + "objective/kl": 31.77776336669922, + "objective/non_score_reward": -1.5888882875442505, + "objective/rlhf_reward": -5.030040267735643, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 36.973690032958984, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.607421875, + "step": 403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970709085464478 + }, + { + "episode": 6480, + "epoch": 0.038825178847467376, + "loss/policy_avg": 0.06557717174291611, + "lr": 9.741820040899796e-06, + "objective/entropy": -141.13568115234375, + "objective/kl": 28.107177734375, + "objective/non_score_reward": -1.405358910560608, + "objective/rlhf_reward": -3.674024294094975, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 31.192813873291016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.791015625, + "step": 404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9952688217163086 + }, + { + "episode": 6496, + "epoch": 0.038921043486596925, + "loss/policy_avg": 0.05502002686262131, + "lr": 9.741180981595093e-06, + "objective/entropy": 32.80726623535156, + "objective/kl": 44.297119140625, + "objective/non_score_reward": -2.2148561477661133, + "objective/rlhf_reward": -7.5001741287454795, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 19.57358169555664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.650390625, + "step": 405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999144077301025 + }, + { + "episode": 6512, + "epoch": 0.039016908125726474, + "loss/policy_avg": 0.026680059731006622, + "lr": 9.74054192229039e-06, + "objective/entropy": 119.29817962646484, + "objective/kl": 39.39287567138672, + "objective/non_score_reward": -1.9696437120437622, + "objective/rlhf_reward": -6.536938837080627, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 0.6370775699615479, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6171875, + "step": 406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0018253326416016 + }, + { + "episode": 6528, + "epoch": 0.03911277276485602, + "loss/policy_avg": 0.6271831393241882, + "lr": 9.739902862985686e-06, + "objective/entropy": 6.752727508544922, + "objective/kl": 25.43050765991211, + "objective/non_score_reward": -1.2715253829956055, + "objective/rlhf_reward": -5.086101770401001, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.81015396118164, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.74609375, + "step": 407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977548122406006 + }, + { + "episode": 6544, + "epoch": 0.03920863740398557, + "loss/policy_avg": 0.30096232891082764, + "lr": 9.739263803680983e-06, + "objective/entropy": -24.516462326049805, + "objective/kl": 38.53913116455078, + "objective/non_score_reward": -1.9269566535949707, + "objective/rlhf_reward": -5.585120143667732, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 15.016406059265137, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.994694709777832 + }, + { + "episode": 6560, + "epoch": 0.03930450204311512, + "loss/policy_avg": 0.03762083500623703, + "lr": 9.73862474437628e-06, + "objective/entropy": -218.5489501953125, + "objective/kl": 26.699615478515625, + "objective/non_score_reward": -1.3349807262420654, + "objective/rlhf_reward": -3.6780635170346363, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 59.4561653137207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982738494873047 + }, + { + "episode": 6576, + "epoch": 0.03940036668224467, + "loss/policy_avg": 0.2932765483856201, + "lr": 9.737985685071575e-06, + "objective/entropy": -25.477672576904297, + "objective/kl": 35.529788970947266, + "objective/non_score_reward": -1.776489496231079, + "objective/rlhf_reward": -5.372624413172403, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 39.98287582397461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55859375, + "step": 410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999699354171753 + }, + { + "episode": 6592, + "epoch": 0.03949623132137422, + "loss/policy_avg": -0.2486688196659088, + "lr": 9.737346625766872e-06, + "objective/entropy": -12.952373504638672, + "objective/kl": 33.62919616699219, + "objective/non_score_reward": -1.681459903717041, + "objective/rlhf_reward": -4.778428207116063, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 28.626731872558594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.56640625, + "step": 411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.003122568130493 + }, + { + "episode": 6608, + "epoch": 0.03959209596050377, + "loss/policy_avg": 0.3249208629131317, + "lr": 9.736707566462167e-06, + "objective/entropy": -52.927459716796875, + "objective/kl": 33.82263946533203, + "objective/non_score_reward": -1.6911320686340332, + "objective/rlhf_reward": -4.364528393745422, + "objective/scores": 0.6, + "policy/approxkl_avg": 41.674591064453125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.791015625, + "step": 412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000582695007324 + }, + { + "episode": 6624, + "epoch": 0.039687960599633317, + "loss/policy_avg": 0.15019002556800842, + "lr": 9.736068507157464e-06, + "objective/entropy": -22.71458625793457, + "objective/kl": 32.99541473388672, + "objective/non_score_reward": -1.6497704982757568, + "objective/rlhf_reward": -5.257446458845763, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 6.256417274475098, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978971481323242 + }, + { + "episode": 6640, + "epoch": 0.039783825238762865, + "loss/policy_avg": 0.296099990606308, + "lr": 9.735429447852761e-06, + "objective/entropy": -10.485054016113281, + "objective/kl": 28.53786277770996, + "objective/non_score_reward": -1.4268931150436401, + "objective/rlhf_reward": -3.9742393652598063, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.458545684814453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.703125, + "step": 414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996311664581299 + }, + { + "episode": 6656, + "epoch": 0.039879689877892414, + "loss/policy_avg": 0.3615366816520691, + "lr": 9.734790388548058e-06, + "objective/entropy": -102.9046859741211, + "objective/kl": 19.901390075683594, + "objective/non_score_reward": -0.9950695037841797, + "objective/rlhf_reward": -2.3184185675984486, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 18.427024841308594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.82421875, + "step": 415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999051809310913 + }, + { + "episode": 6672, + "epoch": 0.03997555451702196, + "loss/policy_avg": 0.14772659540176392, + "lr": 9.734151329243355e-06, + "objective/entropy": -148.49395751953125, + "objective/kl": 26.190744400024414, + "objective/non_score_reward": -1.3095372915267944, + "objective/rlhf_reward": -3.6340291834512524, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 59.936073303222656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.806640625, + "step": 416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001544237136841 + }, + { + "episode": 6688, + "epoch": 0.04007141915615151, + "loss/policy_avg": 0.23557257652282715, + "lr": 9.73351226993865e-06, + "objective/entropy": -145.32284545898438, + "objective/kl": 30.992046356201172, + "objective/non_score_reward": -1.5496025085449219, + "objective/rlhf_reward": -3.7984096765518185, + "objective/scores": 0.6, + "policy/approxkl_avg": 7.065143585205078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.904296875, + "step": 417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989118576049805 + }, + { + "episode": 6704, + "epoch": 0.04016728379528106, + "loss/policy_avg": 0.12179827690124512, + "lr": 9.732873210633947e-06, + "objective/entropy": -64.65836334228516, + "objective/kl": 35.22796630859375, + "objective/non_score_reward": -1.7613983154296875, + "objective/rlhf_reward": -5.686343335841579, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 61.170570373535156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5546875, + "step": 418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985511302947998 + }, + { + "episode": 6720, + "epoch": 0.04026314843441061, + "loss/policy_avg": -0.043803490698337555, + "lr": 9.732234151329244e-06, + "objective/entropy": -87.70707702636719, + "objective/kl": 28.95832061767578, + "objective/non_score_reward": -1.447916030883789, + "objective/rlhf_reward": -4.275892340930637, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 2.8885676860809326, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.763671875, + "step": 419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9999752044677734 + }, + { + "episode": 6736, + "epoch": 0.04035901307354016, + "loss/policy_avg": 0.18042519688606262, + "lr": 9.73159509202454e-06, + "objective/entropy": -4.936176300048828, + "objective/kl": 30.613567352294922, + "objective/non_score_reward": -1.5306785106658936, + "objective/rlhf_reward": -4.722713804244995, + "objective/scores": 0.35, + "policy/approxkl_avg": 209.10888671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.619140625, + "step": 420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993661642074585 + }, + { + "episode": 6752, + "epoch": 0.04045487771266971, + "loss/policy_avg": 0.6567588448524475, + "lr": 9.730956032719838e-06, + "objective/entropy": -162.10116577148438, + "objective/kl": 33.140079498291016, + "objective/non_score_reward": -1.6570039987564087, + "objective/rlhf_reward": -4.505309881941352, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 33.703067779541016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7578125, + "step": 421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9999632835388184 + }, + { + "episode": 6768, + "epoch": 0.04055074235179926, + "loss/policy_avg": 0.5961964130401611, + "lr": 9.730316973415135e-06, + "objective/entropy": 18.374740600585938, + "objective/kl": 36.82442092895508, + "objective/non_score_reward": -1.8412209749221802, + "objective/rlhf_reward": -4.441164646984312, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 62.1960334777832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.654296875, + "step": 422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999306201934814 + }, + { + "episode": 6784, + "epoch": 0.040646606990928806, + "loss/policy_avg": 0.19755011796951294, + "lr": 9.72967791411043e-06, + "objective/entropy": -57.290000915527344, + "objective/kl": 30.764808654785156, + "objective/non_score_reward": -1.5382404327392578, + "objective/rlhf_reward": -4.811326077490478, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 37.60175323486328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.67578125, + "step": 423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990873336791992 + }, + { + "episode": 6800, + "epoch": 0.040742471630058355, + "loss/policy_avg": 0.2760317325592041, + "lr": 9.729038854805727e-06, + "objective/entropy": -54.2406005859375, + "objective/kl": 28.681961059570312, + "objective/non_score_reward": -1.4340981245040894, + "objective/rlhf_reward": -3.7889812094735458, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 13.514376640319824, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.548828125, + "step": 424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0004029273986816 + }, + { + "episode": 6816, + "epoch": 0.040838336269187904, + "loss/policy_avg": 0.05885821580886841, + "lr": 9.728399795501023e-06, + "objective/entropy": -30.280364990234375, + "objective/kl": 31.102825164794922, + "objective/non_score_reward": -1.5551413297653198, + "objective/rlhf_reward": -4.820565319061279, + "objective/scores": 0.35, + "policy/approxkl_avg": 61.290470123291016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.767578125, + "step": 425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986295700073242 + }, + { + "episode": 6832, + "epoch": 0.04093420090831745, + "loss/policy_avg": 0.044344570487737656, + "lr": 9.72776073619632e-06, + "objective/entropy": -223.16510009765625, + "objective/kl": 11.546382904052734, + "objective/non_score_reward": -0.5773191452026367, + "objective/rlhf_reward": -0.3618654114770252, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.5684561729431152, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7109375, + "step": 426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0002474784851074 + }, + { + "episode": 6848, + "epoch": 0.041030065547447, + "loss/policy_avg": 0.11938305199146271, + "lr": 9.727121676891617e-06, + "objective/entropy": -84.6756362915039, + "objective/kl": 32.253173828125, + "objective/non_score_reward": -1.6126585006713867, + "objective/rlhf_reward": -5.000035624118194, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 54.86524963378906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.693359375, + "step": 427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985135793685913 + }, + { + "episode": 6864, + "epoch": 0.04112593018657655, + "loss/policy_avg": -0.02704887092113495, + "lr": 9.726482617586912e-06, + "objective/entropy": 61.31664276123047, + "objective/kl": 50.535186767578125, + "objective/non_score_reward": -2.526759624481201, + "objective/rlhf_reward": -8.765402606039672, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 87.70621490478516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4794921875, + "step": 428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0009117126464844 + }, + { + "episode": 6880, + "epoch": 0.0412217948257061, + "loss/policy_avg": 0.3563253581523895, + "lr": 9.72584355828221e-06, + "objective/entropy": -201.59555053710938, + "objective/kl": 26.542133331298828, + "objective/non_score_reward": -1.3271067142486572, + "objective/rlhf_reward": -2.384707783104154, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 12.606565475463867, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.60546875, + "step": 429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991586208343506 + }, + { + "episode": 6896, + "epoch": 0.04131765946483565, + "loss/policy_avg": 0.3849369287490845, + "lr": 9.725204498977506e-06, + "objective/entropy": -172.11151123046875, + "objective/kl": 31.27842140197754, + "objective/non_score_reward": -1.5639209747314453, + "objective/rlhf_reward": -4.52235098282496, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 35.41864776611328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.794921875, + "step": 430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9977457523345947 + }, + { + "episode": 6912, + "epoch": 0.0414135241039652, + "loss/policy_avg": 0.5410929918289185, + "lr": 9.724565439672803e-06, + "objective/entropy": -53.43696594238281, + "objective/kl": 36.75939178466797, + "objective/non_score_reward": -1.8379695415496826, + "objective/rlhf_reward": -5.229172053114448, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 11.017414093017578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.51953125, + "step": 431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9961919784545898 + }, + { + "episode": 6928, + "epoch": 0.041509388743094754, + "loss/policy_avg": 0.5185568332672119, + "lr": 9.7239263803681e-06, + "objective/entropy": -42.49586486816406, + "objective/kl": 31.465147018432617, + "objective/non_score_reward": -1.5732574462890625, + "objective/rlhf_reward": -4.914427437869412, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.669852614402771, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62109375, + "step": 432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998645782470703 + }, + { + "episode": 6944, + "epoch": 0.0416052533822243, + "loss/policy_avg": -0.09886922687292099, + "lr": 9.723287321063397e-06, + "objective/entropy": -182.28286743164062, + "objective/kl": 27.1431884765625, + "objective/non_score_reward": -1.3571594953536987, + "objective/rlhf_reward": -3.6953046480814615, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 25.096237182617188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0018882751464844 + }, + { + "episode": 6960, + "epoch": 0.04170111802135385, + "loss/policy_avg": 0.39349502325057983, + "lr": 9.722648261758692e-06, + "objective/entropy": 28.20358657836914, + "objective/kl": 38.92597198486328, + "objective/non_score_reward": -1.946298599243164, + "objective/rlhf_reward": -6.385194158554077, + "objective/scores": 0.35, + "policy/approxkl_avg": 46.153385162353516, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4912109375, + "step": 434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992406368255615 + }, + { + "episode": 6976, + "epoch": 0.0417969826604834, + "loss/policy_avg": 0.3586619198322296, + "lr": 9.722009202453989e-06, + "objective/entropy": -126.02680206298828, + "objective/kl": 32.40974807739258, + "objective/non_score_reward": -1.6204874515533447, + "objective/rlhf_reward": -4.534538338856633, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 10.944326400756836, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971150159835815 + }, + { + "episode": 6992, + "epoch": 0.04189284729961295, + "loss/policy_avg": -0.4687817692756653, + "lr": 9.721370143149284e-06, + "objective/entropy": -69.42359924316406, + "objective/kl": 20.10685157775879, + "objective/non_score_reward": -1.0053426027297974, + "objective/rlhf_reward": -2.6427683430291236, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 22.483867645263672, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6328125, + "step": 436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.041172504425049 + }, + { + "episode": 7008, + "epoch": 0.0419887119387425, + "loss/policy_avg": 0.0906272605061531, + "lr": 9.720731083844581e-06, + "objective/entropy": -149.47274780273438, + "objective/kl": 26.28115463256836, + "objective/non_score_reward": -1.3140578269958496, + "objective/rlhf_reward": -3.1335249564805365, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.7223973274230957, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.599609375, + "step": 437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000173568725586 + }, + { + "episode": 7024, + "epoch": 0.04208457657787205, + "loss/policy_avg": 0.3348531126976013, + "lr": 9.720092024539878e-06, + "objective/entropy": 22.56686782836914, + "objective/kl": 36.523582458496094, + "objective/non_score_reward": -1.8261791467666626, + "objective/rlhf_reward": -5.700596723620015, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 20.443164825439453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.515625, + "step": 438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979515075683594 + }, + { + "episode": 7040, + "epoch": 0.042180441217001596, + "loss/policy_avg": 0.04725319519639015, + "lr": 9.719452965235175e-06, + "objective/entropy": -71.08361053466797, + "objective/kl": 20.915573120117188, + "objective/non_score_reward": -1.045778751373291, + "objective/rlhf_reward": -2.0604087731995917, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.088305473327637, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4462890625, + "step": 439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0024590492248535 + }, + { + "episode": 7056, + "epoch": 0.042276305856131145, + "loss/policy_avg": 0.18381188809871674, + "lr": 9.718813905930472e-06, + "objective/entropy": 25.569873809814453, + "objective/kl": 38.07762145996094, + "objective/non_score_reward": -1.9038809537887573, + "objective/rlhf_reward": -3.215523815155029, + "objective/scores": 1.1, + "policy/approxkl_avg": 30.962854385375977, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62890625, + "step": 440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011448860168457 + }, + { + "episode": 7072, + "epoch": 0.042372170495260694, + "loss/policy_avg": 0.1967303454875946, + "lr": 9.718174846625767e-06, + "objective/entropy": -103.38803100585938, + "objective/kl": 29.222076416015625, + "objective/non_score_reward": -1.4611037969589233, + "objective/rlhf_reward": -4.240295205179768, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 8.899417877197266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689453125, + "step": 441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986486434936523 + }, + { + "episode": 7088, + "epoch": 0.04246803513439024, + "loss/policy_avg": -0.07635466754436493, + "lr": 9.717535787321064e-06, + "objective/entropy": -54.58887481689453, + "objective/kl": 35.043663024902344, + "objective/non_score_reward": -1.752183198928833, + "objective/rlhf_reward": -5.527779820378184, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 12.18149185180664, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.580078125, + "step": 442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0013041496276855 + }, + { + "episode": 7104, + "epoch": 0.04256389977351979, + "loss/policy_avg": 0.3104819059371948, + "lr": 9.71689672801636e-06, + "objective/entropy": -53.842830657958984, + "objective/kl": 23.18008804321289, + "objective/non_score_reward": -1.1590044498443604, + "objective/rlhf_reward": -3.0797587921291143, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 86.82899475097656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.91796875, + "step": 443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991846084594727 + }, + { + "episode": 7120, + "epoch": 0.04265976441264934, + "loss/policy_avg": 0.6317604780197144, + "lr": 9.716257668711657e-06, + "objective/entropy": -21.19356918334961, + "objective/kl": 30.069751739501953, + "objective/non_score_reward": -1.503487467765808, + "objective/rlhf_reward": -4.6353477025903285, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 128.40951538085938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.76953125, + "step": 444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997659683227539 + }, + { + "episode": 7136, + "epoch": 0.04275562905177889, + "loss/policy_avg": 0.33194229006767273, + "lr": 9.715618609406954e-06, + "objective/entropy": -102.48907470703125, + "objective/kl": 32.374549865722656, + "objective/non_score_reward": -1.6187275648117065, + "objective/rlhf_reward": -6.474910318851471, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.681756973266602, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998705506324768 + }, + { + "episode": 7152, + "epoch": 0.04285149369090844, + "loss/policy_avg": 0.26850253343582153, + "lr": 9.714979550102251e-06, + "objective/entropy": 69.35136413574219, + "objective/kl": 26.097612380981445, + "objective/non_score_reward": -1.3048806190490723, + "objective/rlhf_reward": -3.738569977696299, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 62.56462097167969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6640625, + "step": 446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99786376953125 + }, + { + "episode": 7168, + "epoch": 0.04294735833003799, + "loss/policy_avg": -0.1885017603635788, + "lr": 9.714340490797546e-06, + "objective/entropy": -16.98421859741211, + "objective/kl": 30.90627670288086, + "objective/non_score_reward": -1.5453139543533325, + "objective/rlhf_reward": -4.577135715548115, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 11.766645431518555, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.73828125, + "step": 447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.003262996673584 + }, + { + "episode": 7184, + "epoch": 0.04304322296916754, + "loss/policy_avg": 0.24147900938987732, + "lr": 9.713701431492843e-06, + "objective/entropy": -196.87869262695312, + "objective/kl": 23.231670379638672, + "objective/non_score_reward": -1.161583662033081, + "objective/rlhf_reward": -3.1305624780976142, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 19.03369903564453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996755123138428 + }, + { + "episode": 7200, + "epoch": 0.043139087608297086, + "loss/policy_avg": 0.3051467537879944, + "lr": 9.71306237218814e-06, + "objective/entropy": -54.2137565612793, + "objective/kl": 33.54918670654297, + "objective/non_score_reward": -1.6774592399597168, + "objective/rlhf_reward": -5.047977810323822, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 74.37176513671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.783203125, + "step": 449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9967325925827026 + }, + { + "episode": 7216, + "epoch": 0.043234952247426635, + "loss/policy_avg": 0.0008301436901092529, + "lr": 9.712423312883437e-06, + "objective/entropy": -37.864322662353516, + "objective/kl": 24.052818298339844, + "objective/non_score_reward": -1.2026410102844238, + "objective/rlhf_reward": -2.9857349946823826, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 1.6498993635177612, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.525390625, + "step": 450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001569986343384 + }, + { + "episode": 7232, + "epoch": 0.043330816886556184, + "loss/policy_avg": 0.10217726975679398, + "lr": 9.711784253578734e-06, + "objective/entropy": -97.12496948242188, + "objective/kl": 20.143707275390625, + "objective/non_score_reward": -1.007185459136963, + "objective/rlhf_reward": -2.669491672252102, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 37.34214401245117, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.783203125, + "step": 451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993760585784912 + }, + { + "episode": 7248, + "epoch": 0.04342668152568573, + "loss/policy_avg": 0.2181258350610733, + "lr": 9.711145194274029e-06, + "objective/entropy": -187.07266235351562, + "objective/kl": 22.520824432373047, + "objective/non_score_reward": -1.1260414123535156, + "objective/rlhf_reward": -2.9000454283395585, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 80.40426635742188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000328540802002 + }, + { + "episode": 7264, + "epoch": 0.04352254616481528, + "loss/policy_avg": 0.28700706362724304, + "lr": 9.710506134969326e-06, + "objective/entropy": -119.91871643066406, + "objective/kl": 30.88311004638672, + "objective/non_score_reward": -1.5441553592681885, + "objective/rlhf_reward": -4.834986022024779, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 14.897968292236328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.787109375, + "step": 453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9972997903823853 + }, + { + "episode": 7280, + "epoch": 0.04361841080394483, + "loss/policy_avg": 0.013649387285113335, + "lr": 9.709867075664623e-06, + "objective/entropy": -137.84861755371094, + "objective/kl": 35.624549865722656, + "objective/non_score_reward": -1.781227707862854, + "objective/rlhf_reward": -7.124910950660706, + "objective/scores": 0.0, + "policy/approxkl_avg": 77.14759826660156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.712890625, + "step": 454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999969720840454 + }, + { + "episode": 7296, + "epoch": 0.04371427544307438, + "loss/policy_avg": 0.9055305123329163, + "lr": 9.70922801635992e-06, + "objective/entropy": -177.1896514892578, + "objective/kl": 34.19129943847656, + "objective/non_score_reward": -1.7095649242401123, + "objective/rlhf_reward": -5.387661199183807, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 51.92662811279297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.66796875, + "step": 455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976481199264526 + }, + { + "episode": 7312, + "epoch": 0.04381014008220393, + "loss/policy_avg": -0.14486947655677795, + "lr": 9.708588957055215e-06, + "objective/entropy": -91.43609619140625, + "objective/kl": 30.12580108642578, + "objective/non_score_reward": -1.5062901973724365, + "objective/rlhf_reward": -4.509388887675938, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 24.85628890991211, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.611328125, + "step": 456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.011446952819824 + }, + { + "episode": 7328, + "epoch": 0.04390600472133348, + "loss/policy_avg": 0.3115137815475464, + "lr": 9.707949897750512e-06, + "objective/entropy": -33.496673583984375, + "objective/kl": 24.4674072265625, + "objective/non_score_reward": -1.2233703136444092, + "objective/rlhf_reward": -3.377709650787052, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 9.057685852050781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.751953125, + "step": 457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009520053863525 + }, + { + "episode": 7344, + "epoch": 0.044001869360463026, + "loss/policy_avg": 1.4892723560333252, + "lr": 9.707310838445809e-06, + "objective/entropy": -35.618934631347656, + "objective/kl": 27.64456558227539, + "objective/non_score_reward": -1.3822282552719116, + "objective/rlhf_reward": -3.5815017921494796, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.899414300918579, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.57421875, + "step": 458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999825954437256 + }, + { + "episode": 7360, + "epoch": 0.044097733999592575, + "loss/policy_avg": 0.022264737635850906, + "lr": 9.706671779141105e-06, + "objective/entropy": 31.060089111328125, + "objective/kl": 34.85979461669922, + "objective/non_score_reward": -1.7429897785186768, + "objective/rlhf_reward": -5.367839369837361, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 7.1077799797058105, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.751953125, + "step": 459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993565082550049 + }, + { + "episode": 7376, + "epoch": 0.044193598638722124, + "loss/policy_avg": 0.08219340443611145, + "lr": 9.7060327198364e-06, + "objective/entropy": -69.6414566040039, + "objective/kl": 35.42669677734375, + "objective/non_score_reward": -1.7713346481323242, + "objective/rlhf_reward": -5.726088785861416, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 21.27887535095215, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.712890625, + "step": 460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0088043212890625 + }, + { + "episode": 7392, + "epoch": 0.04428946327785167, + "loss/policy_avg": 0.03685396909713745, + "lr": 9.705393660531698e-06, + "objective/entropy": -245.04380798339844, + "objective/kl": 21.42380142211914, + "objective/non_score_reward": -1.0711899995803833, + "objective/rlhf_reward": -2.1620538852372504, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.849046230316162, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.008730411529541 + }, + { + "episode": 7408, + "epoch": 0.04438532791698122, + "loss/policy_avg": 0.5492111444473267, + "lr": 9.704754601226994e-06, + "objective/entropy": 9.25466537475586, + "objective/kl": 20.997167587280273, + "objective/non_score_reward": -1.0498583316802979, + "objective/rlhf_reward": -1.2757146700632302, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 36.03380584716797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9296875, + "step": 462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000826120376587 + }, + { + "episode": 7424, + "epoch": 0.04448119255611077, + "loss/policy_avg": 0.22961178421974182, + "lr": 9.704115541922291e-06, + "objective/entropy": -2.9236984252929688, + "objective/kl": 26.89717674255371, + "objective/non_score_reward": -1.3448588848114014, + "objective/rlhf_reward": -3.717576061905013, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 133.2696075439453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8984375, + "step": 463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999076008796692 + }, + { + "episode": 7440, + "epoch": 0.04457705719524032, + "loss/policy_avg": 0.1330358386039734, + "lr": 9.703476482617588e-06, + "objective/entropy": -155.3049774169922, + "objective/kl": 32.32700729370117, + "objective/non_score_reward": -1.6163502931594849, + "objective/rlhf_reward": -2.0654012918472286, + "objective/scores": 1.1, + "policy/approxkl_avg": 352.436767578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.86328125, + "step": 464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9973843097686768 + }, + { + "episode": 7456, + "epoch": 0.04467292183436987, + "loss/policy_avg": 0.13191767036914825, + "lr": 9.702837423312883e-06, + "objective/entropy": -130.06350708007812, + "objective/kl": 31.98480987548828, + "objective/non_score_reward": -1.5992405414581299, + "objective/rlhf_reward": -5.07144889596097, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 4.149503707885742, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979965686798096 + }, + { + "episode": 7472, + "epoch": 0.04476878647349942, + "loss/policy_avg": 0.11230316013097763, + "lr": 9.70219836400818e-06, + "objective/entropy": 11.579151153564453, + "objective/kl": 34.1675910949707, + "objective/non_score_reward": -1.7083796262741089, + "objective/rlhf_reward": -5.3525657681778664, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 13.865779876708984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.708984375, + "step": 466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00162410736084 + }, + { + "episode": 7488, + "epoch": 0.04486465111262897, + "loss/policy_avg": 0.2810555398464203, + "lr": 9.701559304703477e-06, + "objective/entropy": -138.13914489746094, + "objective/kl": 22.91815948486328, + "objective/non_score_reward": -1.145907998085022, + "objective/rlhf_reward": -3.205029585448605, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 97.98136901855469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.775390625, + "step": 467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984302520751953 + }, + { + "episode": 7504, + "epoch": 0.044960515751758516, + "loss/policy_avg": -0.09679757058620453, + "lr": 9.700920245398774e-06, + "objective/entropy": -44.23152160644531, + "objective/kl": 34.52162170410156, + "objective/non_score_reward": -1.726081132888794, + "objective/rlhf_reward": -5.170991019407907, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 12.573694229125977, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3896484375, + "step": 468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995930194854736 + }, + { + "episode": 7520, + "epoch": 0.045056380390888065, + "loss/policy_avg": 0.2740531265735626, + "lr": 9.700281186094071e-06, + "objective/entropy": -64.87997436523438, + "objective/kl": 30.31191062927246, + "objective/non_score_reward": -1.5155954360961914, + "objective/rlhf_reward": -4.329048738876978, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 12.677139282226562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.630859375, + "step": 469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981954097747803 + }, + { + "episode": 7536, + "epoch": 0.045152245030017614, + "loss/policy_avg": 0.4849107265472412, + "lr": 9.699642126789368e-06, + "objective/entropy": -136.48355102539062, + "objective/kl": 20.618619918823242, + "objective/non_score_reward": -1.030930995941162, + "objective/rlhf_reward": -2.6998918845253863, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 95.56924438476562, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.607421875, + "step": 470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975948333740234 + }, + { + "episode": 7552, + "epoch": 0.04524810966914716, + "loss/policy_avg": 0.05032477527856827, + "lr": 9.699003067484663e-06, + "objective/entropy": -116.99330139160156, + "objective/kl": 31.927814483642578, + "objective/non_score_reward": -1.596390724182129, + "objective/rlhf_reward": -5.026312672828121, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.1943883895874023, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.744140625, + "step": 471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0004005432128906 + }, + { + "episode": 7568, + "epoch": 0.04534397430827671, + "loss/policy_avg": 0.23768550157546997, + "lr": 9.69836400817996e-06, + "objective/entropy": -56.441200256347656, + "objective/kl": 35.956565856933594, + "objective/non_score_reward": -1.7978280782699585, + "objective/rlhf_reward": -5.587192330423909, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 18.25104522705078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.75390625, + "step": 472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001668930053711 + }, + { + "episode": 7584, + "epoch": 0.04543983894740626, + "loss/policy_avg": 0.18428431451320648, + "lr": 9.697724948875257e-06, + "objective/entropy": -12.911811828613281, + "objective/kl": 31.440038681030273, + "objective/non_score_reward": -1.5720020532608032, + "objective/rlhf_reward": -4.554674939314523, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 33.68145751953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.673828125, + "step": 473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997875690460205 + }, + { + "episode": 7600, + "epoch": 0.04553570358653581, + "loss/policy_avg": 1.0267724990844727, + "lr": 9.697085889570554e-06, + "objective/entropy": -155.81759643554688, + "objective/kl": 15.551814079284668, + "objective/non_score_reward": -0.7775906920433044, + "objective/rlhf_reward": -1.7317606593049586, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.7084851264953613, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.673828125, + "step": 474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998857975006104 + }, + { + "episode": 7616, + "epoch": 0.04563156822566536, + "loss/policy_avg": 0.5301028490066528, + "lr": 9.69644683026585e-06, + "objective/entropy": -186.65789794921875, + "objective/kl": 37.16144561767578, + "objective/non_score_reward": -1.858072280883789, + "objective/rlhf_reward": -5.876029699054316, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 48.150047302246094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9972370862960815 + }, + { + "episode": 7632, + "epoch": 0.04572743286479491, + "loss/policy_avg": 0.2144310474395752, + "lr": 9.695807770961146e-06, + "objective/entropy": -153.16233825683594, + "objective/kl": 31.742645263671875, + "objective/non_score_reward": -1.5871323347091675, + "objective/rlhf_reward": -4.832757556232151, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 43.260581970214844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996619701385498 + }, + { + "episode": 7648, + "epoch": 0.04582329750392446, + "loss/policy_avg": 0.1423683762550354, + "lr": 9.695168711656443e-06, + "objective/entropy": -101.34695434570312, + "objective/kl": 34.40277099609375, + "objective/non_score_reward": -1.7201385498046875, + "objective/rlhf_reward": -5.555040988951845, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 6.133903503417969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.740234375, + "step": 477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991512298583984 + }, + { + "episode": 7664, + "epoch": 0.045919162143054006, + "loss/policy_avg": -0.20567180216312408, + "lr": 9.694529652351738e-06, + "objective/entropy": 1.8477153778076172, + "objective/kl": 34.25542068481445, + "objective/non_score_reward": -1.7127711772918701, + "objective/rlhf_reward": -5.451084411144256, + "objective/scores": 0.35, + "policy/approxkl_avg": 90.96925354003906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.79296875, + "step": 478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978113174438477 + }, + { + "episode": 7680, + "epoch": 0.046015026782183555, + "loss/policy_avg": 0.04285082221031189, + "lr": 9.693890593047035e-06, + "objective/entropy": -163.51800537109375, + "objective/kl": 39.76237487792969, + "objective/non_score_reward": -1.9881186485290527, + "objective/rlhf_reward": -6.47152245324409, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 41.795677185058594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.64453125, + "step": 479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989352226257324 + }, + { + "episode": 7696, + "epoch": 0.046110891421313104, + "loss/policy_avg": 0.30679094791412354, + "lr": 9.693251533742331e-06, + "objective/entropy": -137.21139526367188, + "objective/kl": 24.817203521728516, + "objective/non_score_reward": -1.2408602237701416, + "objective/rlhf_reward": -3.407181172576502, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 7.010622024536133, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.740234375, + "step": 480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998010516166687 + }, + { + "episode": 7712, + "epoch": 0.04620675606044265, + "loss/policy_avg": 0.14935311675071716, + "lr": 9.692612474437628e-06, + "objective/entropy": -133.61581420898438, + "objective/kl": 28.18117904663086, + "objective/non_score_reward": -1.4090590476989746, + "objective/rlhf_reward": -4.276986324523373, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 41.72409439086914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976277351379395 + }, + { + "episode": 7728, + "epoch": 0.0463026206995722, + "loss/policy_avg": 0.4503282606601715, + "lr": 9.691973415132925e-06, + "objective/entropy": -185.92971801757812, + "objective/kl": 24.44643783569336, + "objective/non_score_reward": -1.22232186794281, + "objective/rlhf_reward": -4.88928747177124, + "objective/scores": 0.0, + "policy/approxkl_avg": 26.91709327697754, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55078125, + "step": 482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986282587051392 + }, + { + "episode": 7744, + "epoch": 0.04639848533870175, + "loss/policy_avg": 0.7586182355880737, + "lr": 9.691334355828222e-06, + "objective/entropy": -136.83555603027344, + "objective/kl": 27.66883087158203, + "objective/non_score_reward": -1.38344144821167, + "objective/rlhf_reward": -3.41105959035543, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 39.446250915527344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4287109375, + "step": 483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9947845935821533 + }, + { + "episode": 7760, + "epoch": 0.0464943499778313, + "loss/policy_avg": 0.47291696071624756, + "lr": 9.690695296523517e-06, + "objective/entropy": 10.135929107666016, + "objective/kl": 31.171567916870117, + "objective/non_score_reward": -1.558578372001648, + "objective/rlhf_reward": -4.572453921259033, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 15.718633651733398, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.611328125, + "step": 484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997343897819519 + }, + { + "episode": 7776, + "epoch": 0.04659021461696085, + "loss/policy_avg": 0.19839856028556824, + "lr": 9.690056237218814e-06, + "objective/entropy": -64.7506332397461, + "objective/kl": 25.45448112487793, + "objective/non_score_reward": -1.2727241516113281, + "objective/rlhf_reward": -2.690896427631378, + "objective/scores": 0.6, + "policy/approxkl_avg": 29.054779052734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.76953125, + "step": 485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977914094924927 + }, + { + "episode": 7792, + "epoch": 0.0466860792560904, + "loss/policy_avg": 0.16692940890789032, + "lr": 9.689417177914111e-06, + "objective/entropy": -200.1573028564453, + "objective/kl": 16.24359893798828, + "objective/non_score_reward": -0.8121800422668457, + "objective/rlhf_reward": -1.6446000672021683, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 3.7478506565093994, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58984375, + "step": 486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997503757476807 + }, + { + "episode": 7808, + "epoch": 0.046781943895219946, + "loss/policy_avg": 0.20832450687885284, + "lr": 9.688778118609408e-06, + "objective/entropy": -229.8734893798828, + "objective/kl": 24.610809326171875, + "objective/non_score_reward": -1.2305405139923096, + "objective/rlhf_reward": -3.3180417156854443, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 50.22547912597656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9977538585662842 + }, + { + "episode": 7824, + "epoch": 0.046877808534349495, + "loss/policy_avg": 0.584824800491333, + "lr": 9.688139059304705e-06, + "objective/entropy": -159.94088745117188, + "objective/kl": 32.78782653808594, + "objective/non_score_reward": -1.6393911838531494, + "objective/rlhf_reward": -5.041793072017368, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 53.52165985107422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4560546875, + "step": 488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9974932670593262 + }, + { + "episode": 7840, + "epoch": 0.046973673173479044, + "loss/policy_avg": 0.10657641291618347, + "lr": 9.6875e-06, + "objective/entropy": -117.46031188964844, + "objective/kl": 22.680068969726562, + "objective/non_score_reward": -1.1340034008026123, + "objective/rlhf_reward": -2.802680269877116, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 31.437467575073242, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984140396118164 + }, + { + "episode": 7856, + "epoch": 0.0470695378126086, + "loss/policy_avg": 0.05225694179534912, + "lr": 9.686860940695297e-06, + "objective/entropy": -102.69722747802734, + "objective/kl": 35.890769958496094, + "objective/non_score_reward": -1.7945387363433838, + "objective/rlhf_reward": -2.7781547069549557, + "objective/scores": 1.1, + "policy/approxkl_avg": 8.238727569580078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.693359375, + "step": 490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996579885482788 + }, + { + "episode": 7872, + "epoch": 0.04716540245173815, + "loss/policy_avg": 0.3118276000022888, + "lr": 9.686221881390594e-06, + "objective/entropy": -42.73939895629883, + "objective/kl": 22.486095428466797, + "objective/non_score_reward": -1.1243047714233398, + "objective/rlhf_reward": -3.0733869268494525, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 29.32803726196289, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.609375, + "step": 491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991399049758911 + }, + { + "episode": 7888, + "epoch": 0.0472612670908677, + "loss/policy_avg": 0.621738076210022, + "lr": 9.68558282208589e-06, + "objective/entropy": -26.77874755859375, + "objective/kl": 33.77405548095703, + "objective/non_score_reward": -1.688702940940857, + "objective/rlhf_reward": -5.198552160468653, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 9.273128509521484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73046875, + "step": 492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988332986831665 + }, + { + "episode": 7904, + "epoch": 0.04735713172999725, + "loss/policy_avg": 0.16049662232398987, + "lr": 9.684943762781188e-06, + "objective/entropy": -84.04755401611328, + "objective/kl": 25.384605407714844, + "objective/non_score_reward": -1.2692303657531738, + "objective/rlhf_reward": -2.1532023891222205, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 0.7223958373069763, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.646484375, + "step": 493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005576610565186 + }, + { + "episode": 7920, + "epoch": 0.047452996369126796, + "loss/policy_avg": 0.3413264751434326, + "lr": 9.684304703476484e-06, + "objective/entropy": -118.85188293457031, + "objective/kl": 30.77880859375, + "objective/non_score_reward": -1.5389404296875, + "objective/rlhf_reward": -4.422428623835246, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 19.30898666381836, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997934103012085 + }, + { + "episode": 7936, + "epoch": 0.047548861008256345, + "loss/policy_avg": -0.016445789486169815, + "lr": 9.68366564417178e-06, + "objective/entropy": -211.39361572265625, + "objective/kl": 26.587682723999023, + "objective/non_score_reward": -1.3293840885162354, + "objective/rlhf_reward": -2.917536354064941, + "objective/scores": 0.6, + "policy/approxkl_avg": 50.449562072753906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5234375, + "step": 495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99893319606781 + }, + { + "episode": 7952, + "epoch": 0.047644725647385894, + "loss/policy_avg": -0.2565712034702301, + "lr": 9.683026584867076e-06, + "objective/entropy": -49.41560363769531, + "objective/kl": 27.722068786621094, + "objective/non_score_reward": -1.3861035108566284, + "objective/rlhf_reward": -3.882554417074309, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 16.277629852294922, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.703125, + "step": 496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.003061294555664 + }, + { + "episode": 7968, + "epoch": 0.04774059028651544, + "loss/policy_avg": 0.17001637816429138, + "lr": 9.682387525562373e-06, + "objective/entropy": -40.254676818847656, + "objective/kl": 25.527742385864258, + "objective/non_score_reward": -1.2763869762420654, + "objective/rlhf_reward": -5.10554826259613, + "objective/scores": 0.0, + "policy/approxkl_avg": 19.284744262695312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9972081184387207 + }, + { + "episode": 7984, + "epoch": 0.04783645492564499, + "loss/policy_avg": 0.08028728514909744, + "lr": 9.68174846625767e-06, + "objective/entropy": -23.79485321044922, + "objective/kl": 23.14282989501953, + "objective/non_score_reward": -1.1571415662765503, + "objective/rlhf_reward": -4.628566324710846, + "objective/scores": 0.0, + "policy/approxkl_avg": 25.781452178955078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4580078125, + "step": 498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980249404907227 + }, + { + "episode": 8000, + "epoch": 0.04793231956477454, + "loss/policy_avg": 0.2174569070339203, + "lr": 9.681109406952967e-06, + "objective/entropy": -109.13389587402344, + "objective/kl": 36.64985656738281, + "objective/non_score_reward": -1.8324928283691406, + "objective/rlhf_reward": -5.951369323817593, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 27.508981704711914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.525390625, + "step": 499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99745512008667 + }, + { + "episode": 8016, + "epoch": 0.04802818420390409, + "loss/policy_avg": 0.13631635904312134, + "lr": 9.680470347648262e-06, + "objective/entropy": -99.519775390625, + "objective/kl": 41.364810943603516, + "objective/non_score_reward": -2.0682406425476074, + "objective/rlhf_reward": -6.448134417804788, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 102.98858642578125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4970703125, + "step": 500, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998145341873169 + }, + { + "episode": 8032, + "epoch": 0.04812404884303364, + "loss/policy_avg": 0.059351589530706406, + "lr": 9.67983128834356e-06, + "objective/entropy": -226.86756896972656, + "objective/kl": 27.588150024414062, + "objective/non_score_reward": -1.379407525062561, + "objective/rlhf_reward": -4.001858436855015, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 6.536296844482422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65625, + "step": 501, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976544380187988 + }, + { + "episode": 8048, + "epoch": 0.04821991348216319, + "loss/policy_avg": 0.5408469438552856, + "lr": 9.679192229038854e-06, + "objective/entropy": 4.518913269042969, + "objective/kl": 37.552825927734375, + "objective/non_score_reward": -1.8776414394378662, + "objective/rlhf_reward": -5.777232364813486, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 8.410907745361328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.787109375, + "step": 502, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991774559020996 + }, + { + "episode": 8064, + "epoch": 0.048315778121292736, + "loss/policy_avg": 1.089150071144104, + "lr": 9.678553169734151e-06, + "objective/entropy": -70.22102355957031, + "objective/kl": 36.886138916015625, + "objective/non_score_reward": -1.8443071842193604, + "objective/rlhf_reward": -5.254522027746711, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 17.696430206298828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.662109375, + "step": 503, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9944283962249756 + }, + { + "episode": 8080, + "epoch": 0.048411642760422285, + "loss/policy_avg": 0.04815336689352989, + "lr": 9.677914110429448e-06, + "objective/entropy": -206.61251831054688, + "objective/kl": 19.784542083740234, + "objective/non_score_reward": -0.9892270565032959, + "objective/rlhf_reward": -2.4411365626179538, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 10.987642288208008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 504, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982428550720215 + }, + { + "episode": 8096, + "epoch": 0.048507507399551834, + "loss/policy_avg": 0.4511667788028717, + "lr": 9.677275051124745e-06, + "objective/entropy": -44.11040496826172, + "objective/kl": 32.054603576660156, + "objective/non_score_reward": -1.6027300357818604, + "objective/rlhf_reward": -4.8951483605229225, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 161.647705078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 505, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990203380584717 + }, + { + "episode": 8112, + "epoch": 0.04860337203868138, + "loss/policy_avg": 0.43728113174438477, + "lr": 9.676635991820042e-06, + "objective/entropy": -167.46401977539062, + "objective/kl": 25.358474731445312, + "objective/non_score_reward": -1.2679238319396973, + "objective/rlhf_reward": -5.071695148944855, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.505180358886719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.67578125, + "step": 506, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999653697013855 + }, + { + "episode": 8128, + "epoch": 0.04869923667781093, + "loss/policy_avg": 0.049704909324645996, + "lr": 9.675996932515339e-06, + "objective/entropy": -68.84889221191406, + "objective/kl": 23.506563186645508, + "objective/non_score_reward": -1.1753281354904175, + "objective/rlhf_reward": -3.3227105523027003, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.8750853538513184, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.783203125, + "step": 507, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99936842918396 + }, + { + "episode": 8144, + "epoch": 0.04879510131694048, + "loss/policy_avg": 0.23126532137393951, + "lr": 9.675357873210634e-06, + "objective/entropy": -193.32493591308594, + "objective/kl": 30.975135803222656, + "objective/non_score_reward": -1.5487568378448486, + "objective/rlhf_reward": -4.072320940271888, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 30.721832275390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.599609375, + "step": 508, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.994474172592163 + }, + { + "episode": 8160, + "epoch": 0.04889096595607003, + "loss/policy_avg": 0.6136177778244019, + "lr": 9.67471881390593e-06, + "objective/entropy": 35.12611770629883, + "objective/kl": 24.636138916015625, + "objective/non_score_reward": -1.2318068742752075, + "objective/rlhf_reward": -2.979816268162663, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 31.945526123046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736328125, + "step": 509, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001587390899658 + }, + { + "episode": 8176, + "epoch": 0.04898683059519958, + "loss/policy_avg": 0.07654842734336853, + "lr": 9.674079754601228e-06, + "objective/entropy": -218.7822265625, + "objective/kl": 30.072967529296875, + "objective/non_score_reward": -1.5036484003067017, + "objective/rlhf_reward": -3.8918873689332347, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 42.21351623535156, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.763671875, + "step": 510, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9961347579956055 + }, + { + "episode": 8192, + "epoch": 0.04908269523432913, + "loss/policy_avg": 0.4642539322376251, + "lr": 9.673440695296525e-06, + "objective/entropy": -61.26002502441406, + "objective/kl": 28.09502410888672, + "objective/non_score_reward": -1.4047513008117676, + "objective/rlhf_reward": -4.168407420726165, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 28.139495849609375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.50390625, + "step": 511, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988558292388916 + }, + { + "episode": 8208, + "epoch": 0.04917855987345868, + "loss/policy_avg": -0.1496490240097046, + "lr": 9.672801635991821e-06, + "objective/entropy": -237.9604034423828, + "objective/kl": 24.80710220336914, + "objective/non_score_reward": -1.2403552532196045, + "objective/rlhf_reward": -3.5828184867776454, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 9.494747161865234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.671875, + "step": 512, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000725507736206 + }, + { + "episode": 8224, + "epoch": 0.049274424512588226, + "loss/policy_avg": -0.18209466338157654, + "lr": 9.672162576687117e-06, + "objective/entropy": -180.66116333007812, + "objective/kl": 25.97962188720703, + "objective/non_score_reward": -1.2989810705184937, + "objective/rlhf_reward": -3.073217930571113, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 41.079193115234375, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.73046875, + "step": 513, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997527837753296 + }, + { + "episode": 8240, + "epoch": 0.049370289151717775, + "loss/policy_avg": 0.3504701852798462, + "lr": 9.671523517382413e-06, + "objective/entropy": -98.80787658691406, + "objective/kl": 26.576587677001953, + "objective/non_score_reward": -1.3288295269012451, + "objective/rlhf_reward": -0.9153180480003353, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.758487701416016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6953125, + "step": 514, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998854160308838 + }, + { + "episode": 8256, + "epoch": 0.049466153790847324, + "loss/policy_avg": 0.48611417412757874, + "lr": 9.67088445807771e-06, + "objective/entropy": -128.45774841308594, + "objective/kl": 29.784334182739258, + "objective/non_score_reward": -1.4892168045043945, + "objective/rlhf_reward": -4.223533527056375, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.2566263675689697, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 515, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001587867736816 + }, + { + "episode": 8272, + "epoch": 0.04956201842997687, + "loss/policy_avg": -0.13057222962379456, + "lr": 9.670245398773007e-06, + "objective/entropy": -146.07781982421875, + "objective/kl": 31.182106018066406, + "objective/non_score_reward": -1.5591052770614624, + "objective/rlhf_reward": -3.8364211082458493, + "objective/scores": 0.6, + "policy/approxkl_avg": 15.76829719543457, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.576171875, + "step": 516, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0000805854797363 + }, + { + "episode": 8288, + "epoch": 0.04965788306910642, + "loss/policy_avg": 0.637583315372467, + "lr": 9.669606339468304e-06, + "objective/entropy": -144.37762451171875, + "objective/kl": 27.648868560791016, + "objective/non_score_reward": -1.3824436664581299, + "objective/rlhf_reward": -4.0140026448094215, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 2.933715343475342, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 517, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995014667510986 + }, + { + "episode": 8304, + "epoch": 0.04975374770823597, + "loss/policy_avg": 0.23517751693725586, + "lr": 9.668967280163601e-06, + "objective/entropy": -130.0078125, + "objective/kl": 26.889904022216797, + "objective/non_score_reward": -1.344495415687561, + "objective/rlhf_reward": -3.927383343787536, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 35.43697738647461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.81640625, + "step": 518, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9984192848205566 + }, + { + "episode": 8320, + "epoch": 0.04984961234736552, + "loss/policy_avg": -0.05650443956255913, + "lr": 9.668328220858896e-06, + "objective/entropy": -214.1605682373047, + "objective/kl": 21.148624420166016, + "objective/non_score_reward": -1.0574312210083008, + "objective/rlhf_reward": -2.673465876784876, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 18.935588836669922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.599609375, + "step": 519, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993261098861694 + }, + { + "episode": 8336, + "epoch": 0.04994547698649507, + "loss/policy_avg": -0.034447960555553436, + "lr": 9.667689161554193e-06, + "objective/entropy": -158.14088439941406, + "objective/kl": 32.29146957397461, + "objective/non_score_reward": -1.61457359790802, + "objective/rlhf_reward": -4.902035086360529, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 6.876145362854004, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58984375, + "step": 520, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993882179260254 + }, + { + "episode": 8352, + "epoch": 0.05004134162562462, + "loss/policy_avg": -0.13744737207889557, + "lr": 9.66705010224949e-06, + "objective/entropy": -204.13546752929688, + "objective/kl": 28.699504852294922, + "objective/non_score_reward": -1.4349753856658936, + "objective/rlhf_reward": -4.361299076167446, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 2.3828086853027344, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.66796875, + "step": 521, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0017244815826416 + }, + { + "episode": 8368, + "epoch": 0.05013720626475417, + "loss/policy_avg": 0.13512714207172394, + "lr": 9.666411042944787e-06, + "objective/entropy": -234.03375244140625, + "objective/kl": 27.24090576171875, + "objective/non_score_reward": -1.3620452880859375, + "objective/rlhf_reward": -3.932409131320652, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 27.1795654296875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.689453125, + "step": 522, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999112844467163 + }, + { + "episode": 8384, + "epoch": 0.050233070903883716, + "loss/policy_avg": -0.011349002830684185, + "lr": 9.665771983640082e-06, + "objective/entropy": -252.35935974121094, + "objective/kl": 35.68749237060547, + "objective/non_score_reward": -1.784374713897705, + "objective/rlhf_reward": -5.7588963890946925, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 13.969385147094727, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.626953125, + "step": 523, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9982863664627075 + }, + { + "episode": 8400, + "epoch": 0.050328935543013265, + "loss/policy_avg": 0.03610409051179886, + "lr": 9.665132924335379e-06, + "objective/entropy": -18.527732849121094, + "objective/kl": 31.889944076538086, + "objective/non_score_reward": -1.5944972038269043, + "objective/rlhf_reward": -4.927390317530975, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 69.35887145996094, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.86328125, + "step": 524, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999839425086975 + }, + { + "episode": 8416, + "epoch": 0.050424800182142814, + "loss/policy_avg": 0.4427942633628845, + "lr": 9.664493865030676e-06, + "objective/entropy": -203.7809295654297, + "objective/kl": 25.36702537536621, + "objective/non_score_reward": -1.2683511972427368, + "objective/rlhf_reward": -3.6495729281502642, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 22.38974380493164, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.828125, + "step": 525, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989018440246582 + }, + { + "episode": 8432, + "epoch": 0.05052066482127236, + "loss/policy_avg": 1.6773953437805176, + "lr": 9.663854805725971e-06, + "objective/entropy": -146.93841552734375, + "objective/kl": 37.069419860839844, + "objective/non_score_reward": -1.853471040725708, + "objective/rlhf_reward": -5.990052063663569, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 11.231493949890137, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.53515625, + "step": 526, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9981741905212402 + }, + { + "episode": 8448, + "epoch": 0.05061652946040191, + "loss/policy_avg": -0.08897572010755539, + "lr": 9.663215746421268e-06, + "objective/entropy": -158.65708923339844, + "objective/kl": 23.60004997253418, + "objective/non_score_reward": -1.1800025701522827, + "objective/rlhf_reward": -3.394497547179384, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 10.824882507324219, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6953125, + "step": 527, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995331764221191 + }, + { + "episode": 8464, + "epoch": 0.05071239409953146, + "loss/policy_avg": 0.024341005831956863, + "lr": 9.662576687116565e-06, + "objective/entropy": -174.72035217285156, + "objective/kl": 29.104461669921875, + "objective/non_score_reward": -1.4552230834960938, + "objective/rlhf_reward": -4.479256918936401, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 17.054231643676758, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.470703125, + "step": 528, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999741554260254 + }, + { + "episode": 8480, + "epoch": 0.05080825873866101, + "loss/policy_avg": 0.257159948348999, + "lr": 9.661937627811862e-06, + "objective/entropy": -200.30184936523438, + "objective/kl": 23.69171905517578, + "objective/non_score_reward": -1.1845859289169312, + "objective/rlhf_reward": -3.338343775272369, + "objective/scores": 0.35, + "policy/approxkl_avg": 6.550008773803711, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.595703125, + "step": 529, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9984736442565918 + }, + { + "episode": 8496, + "epoch": 0.05090412337779056, + "loss/policy_avg": 0.4184650182723999, + "lr": 9.661298568507158e-06, + "objective/entropy": -344.7420959472656, + "objective/kl": 24.219188690185547, + "objective/non_score_reward": -1.2109594345092773, + "objective/rlhf_reward": -3.4652354205525935, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 67.58980560302734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 530, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9985125064849854 + }, + { + "episode": 8512, + "epoch": 0.05099998801692011, + "loss/policy_avg": -0.0187949538230896, + "lr": 9.660659509202455e-06, + "objective/entropy": -14.01883316040039, + "objective/kl": 29.49643325805664, + "objective/non_score_reward": -1.47482168674469, + "objective/rlhf_reward": -4.520684697715145, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 6.090343475341797, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.59765625, + "step": 531, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0014190673828125 + }, + { + "episode": 8528, + "epoch": 0.051095852656049656, + "loss/policy_avg": 0.5480527877807617, + "lr": 9.66002044989775e-06, + "objective/entropy": -169.82949829101562, + "objective/kl": 34.57899475097656, + "objective/non_score_reward": -1.728949785232544, + "objective/rlhf_reward": -5.434846642430186, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 7.255028247833252, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.638671875, + "step": 532, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971615076065063 + }, + { + "episode": 8544, + "epoch": 0.051191717295179205, + "loss/policy_avg": 0.2761814594268799, + "lr": 9.659381390593047e-06, + "objective/entropy": -100.77452850341797, + "objective/kl": 36.835365295410156, + "objective/non_score_reward": -1.8417682647705078, + "objective/rlhf_reward": -6.007823192809505, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 50.438026428222656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 533, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985384941101074 + }, + { + "episode": 8560, + "epoch": 0.051287581934308754, + "loss/policy_avg": 0.4119563698768616, + "lr": 9.658742331288344e-06, + "objective/entropy": -65.70556640625, + "objective/kl": 29.577213287353516, + "objective/non_score_reward": -1.47886061668396, + "objective/rlhf_reward": -3.792736174837623, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.75493049621582, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4990234375, + "step": 534, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002697467803955 + }, + { + "episode": 8576, + "epoch": 0.0513834465734383, + "loss/policy_avg": 0.12609338760375977, + "lr": 9.658103271983641e-06, + "objective/entropy": -150.71954345703125, + "objective/kl": 28.952709197998047, + "objective/non_score_reward": -1.447635531425476, + "objective/rlhf_reward": -4.3905422449111935, + "objective/scores": 0.35, + "policy/approxkl_avg": 34.924835205078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.615234375, + "step": 535, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004661083221436 + }, + { + "episode": 8592, + "epoch": 0.05147931121256785, + "loss/policy_avg": 0.014640828594565392, + "lr": 9.657464212678938e-06, + "objective/entropy": -37.74507141113281, + "objective/kl": 25.910266876220703, + "objective/non_score_reward": -1.295513391494751, + "objective/rlhf_reward": -0.7820532083511349, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.0191965103149414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.751953125, + "step": 536, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0010244846343994 + }, + { + "episode": 8608, + "epoch": 0.0515751758516974, + "loss/policy_avg": 0.04429921880364418, + "lr": 9.656825153374235e-06, + "objective/entropy": -26.176483154296875, + "objective/kl": 32.8004264831543, + "objective/non_score_reward": -1.6400213241577148, + "objective/rlhf_reward": -4.826751814285913, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 42.128135681152344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.908203125, + "step": 537, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0013113021850586 + }, + { + "episode": 8624, + "epoch": 0.05167104049082695, + "loss/policy_avg": 0.46547916531562805, + "lr": 9.65618609406953e-06, + "objective/entropy": 7.776313781738281, + "objective/kl": 28.19791030883789, + "objective/non_score_reward": -1.4098955392837524, + "objective/rlhf_reward": -3.906248764197031, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.504173755645752, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8828125, + "step": 538, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998396635055542 + }, + { + "episode": 8640, + "epoch": 0.0517669051299565, + "loss/policy_avg": 0.0001214742660522461, + "lr": 9.655547034764827e-06, + "objective/entropy": -112.6850357055664, + "objective/kl": 31.756372451782227, + "objective/non_score_reward": -1.5878187417984009, + "objective/rlhf_reward": -4.228568734900032, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.7504100799560547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.546875, + "step": 539, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001778602600098 + }, + { + "episode": 8656, + "epoch": 0.05186276976908605, + "loss/policy_avg": 0.41524794697761536, + "lr": 9.654907975460124e-06, + "objective/entropy": -135.01878356933594, + "objective/kl": 23.119266510009766, + "objective/non_score_reward": -1.1559633016586304, + "objective/rlhf_reward": -3.0675939609676153, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 26.581480026245117, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6015625, + "step": 540, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9962246417999268 + }, + { + "episode": 8672, + "epoch": 0.0519586344082156, + "loss/policy_avg": 0.3321428894996643, + "lr": 9.65426891615542e-06, + "objective/entropy": -5.44740104675293, + "objective/kl": 39.89240264892578, + "objective/non_score_reward": -1.9946203231811523, + "objective/rlhf_reward": -7.97848105430603, + "objective/scores": 0.0, + "policy/approxkl_avg": 67.52932739257812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 541, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9986319541931152 + }, + { + "episode": 8688, + "epoch": 0.052054499047345146, + "loss/policy_avg": 0.22704890370368958, + "lr": 9.653629856850718e-06, + "objective/entropy": 23.631000518798828, + "objective/kl": 22.43924331665039, + "objective/non_score_reward": -1.121962308883667, + "objective/rlhf_reward": -3.109246918050152, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 40.600868225097656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.708984375, + "step": 542, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0008883476257324 + }, + { + "episode": 8704, + "epoch": 0.052150363686474695, + "loss/policy_avg": 0.6167892217636108, + "lr": 9.652990797546013e-06, + "objective/entropy": 8.02947998046875, + "objective/kl": 34.78337478637695, + "objective/non_score_reward": -1.739168643951416, + "objective/rlhf_reward": -5.556674695014953, + "objective/scores": 0.35, + "policy/approxkl_avg": 7.763035774230957, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.55859375, + "step": 543, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983458518981934 + }, + { + "episode": 8720, + "epoch": 0.052246228325604244, + "loss/policy_avg": 0.1720658838748932, + "lr": 9.65235173824131e-06, + "objective/entropy": 0.5252876281738281, + "objective/kl": 31.73941993713379, + "objective/non_score_reward": -1.5869710445404053, + "objective/rlhf_reward": -4.79162499209936, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 6.366281509399414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.76953125, + "step": 544, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988775253295898 + }, + { + "episode": 8736, + "epoch": 0.05234209296473379, + "loss/policy_avg": 0.07084909081459045, + "lr": 9.651712678936605e-06, + "objective/entropy": -50.734527587890625, + "objective/kl": 24.657032012939453, + "objective/non_score_reward": -1.2328516244888306, + "objective/rlhf_reward": -3.1065776899185886, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 12.337860107421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.521484375, + "step": 545, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9985425472259521 + }, + { + "episode": 8752, + "epoch": 0.05243795760386334, + "loss/policy_avg": -0.053861357271671295, + "lr": 9.651073619631902e-06, + "objective/entropy": -242.29559326171875, + "objective/kl": 21.178913116455078, + "objective/non_score_reward": -1.058945655822754, + "objective/rlhf_reward": -2.6316629386583146, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 23.818538665771484, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.62109375, + "step": 546, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0112152099609375 + }, + { + "episode": 8768, + "epoch": 0.0525338222429929, + "loss/policy_avg": -0.008508548140525818, + "lr": 9.650434560327199e-06, + "objective/entropy": -46.92424011230469, + "objective/kl": 39.04132843017578, + "objective/non_score_reward": -1.952066421508789, + "objective/rlhf_reward": -6.429663398352963, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 15.27535629272461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4677734375, + "step": 547, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982926845550537 + }, + { + "episode": 8784, + "epoch": 0.052629686882122446, + "loss/policy_avg": 0.17654258012771606, + "lr": 9.649795501022496e-06, + "objective/entropy": -44.7242431640625, + "objective/kl": 19.804813385009766, + "objective/non_score_reward": -0.9902406930923462, + "objective/rlhf_reward": -2.635450038939638, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 39.75682067871094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.76953125, + "step": 548, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002223014831543 + }, + { + "episode": 8800, + "epoch": 0.052725551521251995, + "loss/policy_avg": 0.46367156505584717, + "lr": 9.649156441717792e-06, + "objective/entropy": -132.18556213378906, + "objective/kl": 38.18450927734375, + "objective/non_score_reward": -1.909225344657898, + "objective/rlhf_reward": -6.0327816343942455, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 24.263263702392578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 549, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974383115768433 + }, + { + "episode": 8816, + "epoch": 0.052821416160381544, + "loss/policy_avg": 0.2747136950492859, + "lr": 9.64851738241309e-06, + "objective/entropy": -91.26388549804688, + "objective/kl": 28.735111236572266, + "objective/non_score_reward": -1.4367555379867554, + "objective/rlhf_reward": -4.085162764013396, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 5.113122940063477, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.626953125, + "step": 550, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000535011291504 + }, + { + "episode": 8832, + "epoch": 0.05291728079951109, + "loss/policy_avg": 0.031243963167071342, + "lr": 9.647878323108384e-06, + "objective/entropy": -40.358192443847656, + "objective/kl": 31.673667907714844, + "objective/non_score_reward": -1.5836834907531738, + "objective/rlhf_reward": -4.993098309546142, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 78.17581939697266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.796875, + "step": 551, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989157915115356 + }, + { + "episode": 8848, + "epoch": 0.05301314543864064, + "loss/policy_avg": -0.28017422556877136, + "lr": 9.647239263803681e-06, + "objective/entropy": -100.97856140136719, + "objective/kl": 33.18678283691406, + "objective/non_score_reward": -1.659339189529419, + "objective/rlhf_reward": -6.637356638908386, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.006505012512207, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.572265625, + "step": 552, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003824234008789 + }, + { + "episode": 8864, + "epoch": 0.05310901007777019, + "loss/policy_avg": 0.04892890527844429, + "lr": 9.646600204498978e-06, + "objective/entropy": -136.31918334960938, + "objective/kl": 19.06879997253418, + "objective/non_score_reward": -0.9534400105476379, + "objective/rlhf_reward": -2.2575007369190008, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.5354987382888794, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.572265625, + "step": 553, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003046989440918 + }, + { + "episode": 8880, + "epoch": 0.05320487471689974, + "loss/policy_avg": 0.1114959716796875, + "lr": 9.645961145194275e-06, + "objective/entropy": -125.14915466308594, + "objective/kl": 41.65575408935547, + "objective/non_score_reward": -2.0827877521514893, + "objective/rlhf_reward": -6.383740137295659, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 12.4759521484375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66796875, + "step": 554, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973247051239014 + }, + { + "episode": 8896, + "epoch": 0.05330073935602929, + "loss/policy_avg": 0.2784144878387451, + "lr": 9.645322085889572e-06, + "objective/entropy": -42.213340759277344, + "objective/kl": 34.43170928955078, + "objective/non_score_reward": -1.7215855121612549, + "objective/rlhf_reward": -6.8863421976566315, + "objective/scores": 0.0, + "policy/approxkl_avg": 37.5791015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.546875, + "step": 555, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974336624145508 + }, + { + "episode": 8912, + "epoch": 0.05339660399515884, + "loss/policy_avg": -0.0683375895023346, + "lr": 9.644683026584867e-06, + "objective/entropy": -94.292724609375, + "objective/kl": 29.925048828125, + "objective/non_score_reward": -1.4962522983551025, + "objective/rlhf_reward": -4.4287500669627935, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 0.9679741263389587, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.53125, + "step": 556, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002323627471924 + }, + { + "episode": 8928, + "epoch": 0.05349246863428839, + "loss/policy_avg": 0.3528517484664917, + "lr": 9.644043967280164e-06, + "objective/entropy": 100.1601791381836, + "objective/kl": 29.87194061279297, + "objective/non_score_reward": -1.4935970306396484, + "objective/rlhf_reward": -4.493435802872538, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 21.40321922302246, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 557, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999911904335022 + }, + { + "episode": 8944, + "epoch": 0.053588333273417936, + "loss/policy_avg": 0.15664523839950562, + "lr": 9.643404907975461e-06, + "objective/entropy": -163.13458251953125, + "objective/kl": 43.485382080078125, + "objective/non_score_reward": -2.174269199371338, + "objective/rlhf_reward": -6.297076797485351, + "objective/scores": 0.6, + "policy/approxkl_avg": 28.333932876586914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.52734375, + "step": 558, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9964426755905151 + }, + { + "episode": 8960, + "epoch": 0.053684197912547485, + "loss/policy_avg": 0.6344835162162781, + "lr": 9.642765848670758e-06, + "objective/entropy": -252.752685546875, + "objective/kl": 33.16960144042969, + "objective/non_score_reward": -1.658479928970337, + "objective/rlhf_reward": -5.255317785827023, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 52.37012481689453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.771484375, + "step": 559, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0004310607910156 + }, + { + "episode": 8976, + "epoch": 0.053780062551677034, + "loss/policy_avg": 0.19869406521320343, + "lr": 9.642126789366055e-06, + "objective/entropy": -50.086647033691406, + "objective/kl": 30.926883697509766, + "objective/non_score_reward": -1.5463443994522095, + "objective/rlhf_reward": -4.629118292537287, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 26.995628356933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.626953125, + "step": 560, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9950168132781982 + }, + { + "episode": 8992, + "epoch": 0.05387592719080658, + "loss/policy_avg": -0.010918349027633667, + "lr": 9.641487730061352e-06, + "objective/entropy": -168.9771728515625, + "objective/kl": 22.5106201171875, + "objective/non_score_reward": -1.1255309581756592, + "objective/rlhf_reward": -3.1604882984453733, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 20.162094116210938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.763671875, + "step": 561, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.001406669616699 + }, + { + "episode": 9008, + "epoch": 0.05397179182993613, + "loss/policy_avg": 0.4963573217391968, + "lr": 9.640848670756647e-06, + "objective/entropy": -159.58302307128906, + "objective/kl": 34.39787673950195, + "objective/non_score_reward": -1.7198940515518188, + "objective/rlhf_reward": -5.455743868549433, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 32.154441833496094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 562, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99989652633667 + }, + { + "episode": 9024, + "epoch": 0.05406765646906568, + "loss/policy_avg": 0.4512660503387451, + "lr": 9.640209611451944e-06, + "objective/entropy": -112.33628845214844, + "objective/kl": 34.371681213378906, + "objective/non_score_reward": -1.7185840606689453, + "objective/rlhf_reward": -5.515086495612545, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 4.578237056732178, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.599609375, + "step": 563, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984909296035767 + }, + { + "episode": 9040, + "epoch": 0.05416352110819523, + "loss/policy_avg": 0.08781366050243378, + "lr": 9.63957055214724e-06, + "objective/entropy": -39.49800491333008, + "objective/kl": 33.1617431640625, + "objective/non_score_reward": -1.6580872535705566, + "objective/rlhf_reward": -4.232348775863647, + "objective/scores": 0.6, + "policy/approxkl_avg": 4.19449520111084, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.677734375, + "step": 564, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000304698944092 + }, + { + "episode": 9056, + "epoch": 0.05425938574732478, + "loss/policy_avg": 0.02701903134584427, + "lr": 9.638931492842537e-06, + "objective/entropy": -135.10118103027344, + "objective/kl": 34.19304656982422, + "objective/non_score_reward": -1.7096521854400635, + "objective/rlhf_reward": -5.388010840030059, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 18.33478546142578, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.61328125, + "step": 565, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999070644378662 + }, + { + "episode": 9072, + "epoch": 0.05435525038645433, + "loss/policy_avg": 0.2804332375526428, + "lr": 9.638292433537834e-06, + "objective/entropy": -100.01052856445312, + "objective/kl": 28.388795852661133, + "objective/non_score_reward": -1.4194397926330566, + "objective/rlhf_reward": -5.677759170532227, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.587360382080078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 566, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0005486011505127 + }, + { + "episode": 9088, + "epoch": 0.05445111502558388, + "loss/policy_avg": 0.4314262866973877, + "lr": 9.63765337423313e-06, + "objective/entropy": -130.2495574951172, + "objective/kl": 35.38700866699219, + "objective/non_score_reward": -1.7693501710891724, + "objective/rlhf_reward": -5.4155414156323545, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 44.93388366699219, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.75390625, + "step": 567, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9968568086624146 + }, + { + "episode": 9104, + "epoch": 0.054546979664713426, + "loss/policy_avg": 0.3399587869644165, + "lr": 9.637014314928426e-06, + "objective/entropy": -247.61073303222656, + "objective/kl": 28.445119857788086, + "objective/non_score_reward": -1.4222559928894043, + "objective/rlhf_reward": -3.864195342334818, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 7.162724018096924, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.607421875, + "step": 568, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984673261642456 + }, + { + "episode": 9120, + "epoch": 0.054642844303842975, + "loss/policy_avg": 0.5520263314247131, + "lr": 9.636375255623721e-06, + "objective/entropy": -97.92376708984375, + "objective/kl": 26.055057525634766, + "objective/non_score_reward": -1.30275297164917, + "objective/rlhf_reward": -3.088305356279884, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 36.18694305419922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.494140625, + "step": 569, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0012941360473633 + }, + { + "episode": 9136, + "epoch": 0.054738708942972523, + "loss/policy_avg": 0.09734541922807693, + "lr": 9.635736196319018e-06, + "objective/entropy": -196.53872680664062, + "objective/kl": 23.71702003479004, + "objective/non_score_reward": -1.185850977897644, + "objective/rlhf_reward": -4.743403911590576, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.213500738143921, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.580078125, + "step": 570, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993475675582886 + }, + { + "episode": 9152, + "epoch": 0.05483457358210207, + "loss/policy_avg": 0.4516823887825012, + "lr": 9.635097137014315e-06, + "objective/entropy": -126.11761474609375, + "objective/kl": 28.336185455322266, + "objective/non_score_reward": -1.4168094396591187, + "objective/rlhf_reward": -1.2672375202178952, + "objective/scores": 1.1, + "policy/approxkl_avg": 44.684326171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.775390625, + "step": 571, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990687370300293 + }, + { + "episode": 9168, + "epoch": 0.05493043822123162, + "loss/policy_avg": 0.34894299507141113, + "lr": 9.634458077709612e-06, + "objective/entropy": -3.410472869873047, + "objective/kl": 35.99509048461914, + "objective/non_score_reward": -1.7997545003890991, + "objective/rlhf_reward": -5.87350514891736, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 4.621858596801758, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 572, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0019896030426025 + }, + { + "episode": 9184, + "epoch": 0.05502630286036117, + "loss/policy_avg": 0.1023169457912445, + "lr": 9.633819018404909e-06, + "objective/entropy": -180.73724365234375, + "objective/kl": 24.693328857421875, + "objective/non_score_reward": -1.2346664667129517, + "objective/rlhf_reward": -3.5386658668518063, + "objective/scores": 0.35, + "policy/approxkl_avg": 22.89309310913086, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.64453125, + "step": 573, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981255531311035 + }, + { + "episode": 9200, + "epoch": 0.05512216749949072, + "loss/policy_avg": 0.2509443163871765, + "lr": 9.633179959100206e-06, + "objective/entropy": -268.43072509765625, + "objective/kl": 28.437435150146484, + "objective/non_score_reward": -1.4218716621398926, + "objective/rlhf_reward": -4.131227611508921, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 60.228729248046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.78125, + "step": 574, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000219345092773 + }, + { + "episode": 9216, + "epoch": 0.05521803213862027, + "loss/policy_avg": -0.04683633893728256, + "lr": 9.632540899795501e-06, + "objective/entropy": -70.71329498291016, + "objective/kl": 38.51101303100586, + "objective/non_score_reward": -1.9255508184432983, + "objective/rlhf_reward": -5.877374465736459, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 3.3532156944274902, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.615234375, + "step": 575, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000458002090454 + }, + { + "episode": 9232, + "epoch": 0.05531389677774982, + "loss/policy_avg": 0.25571292638778687, + "lr": 9.631901840490798e-06, + "objective/entropy": -197.88787841796875, + "objective/kl": 25.574037551879883, + "objective/non_score_reward": -1.278701901435852, + "objective/rlhf_reward": -3.3814741532007853, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 6.096738815307617, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.671875, + "step": 576, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001577854156494 + }, + { + "episode": 9248, + "epoch": 0.055409761416879366, + "loss/policy_avg": 0.7064580917358398, + "lr": 9.631262781186095e-06, + "objective/entropy": -150.29953002929688, + "objective/kl": 30.821884155273438, + "objective/non_score_reward": -1.5410943031311035, + "objective/rlhf_reward": -4.43104387919108, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 43.45115280151367, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.60546875, + "step": 577, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9980316162109375 + }, + { + "episode": 9264, + "epoch": 0.055505626056008915, + "loss/policy_avg": 0.20062510669231415, + "lr": 9.630623721881392e-06, + "objective/entropy": -158.88388061523438, + "objective/kl": 28.73421859741211, + "objective/non_score_reward": -1.4367109537124634, + "objective/rlhf_reward": -4.346843814849853, + "objective/scores": 0.35, + "policy/approxkl_avg": 12.110857963562012, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 578, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998002290725708 + }, + { + "episode": 9280, + "epoch": 0.055601490695138464, + "loss/policy_avg": 0.08450721949338913, + "lr": 9.629984662576689e-06, + "objective/entropy": -250.45445251464844, + "objective/kl": 27.57752227783203, + "objective/non_score_reward": -1.3788762092590332, + "objective/rlhf_reward": -4.064906816096649, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 17.175188064575195, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5859375, + "step": 579, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997262716293335 + }, + { + "episode": 9296, + "epoch": 0.05569735533426801, + "loss/policy_avg": 0.41482874751091003, + "lr": 9.629345603271984e-06, + "objective/entropy": -177.06607055664062, + "objective/kl": 29.43456268310547, + "objective/non_score_reward": -1.4717282056808472, + "objective/rlhf_reward": -2.9631939872514934, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 50.86977005004883, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.595703125, + "step": 580, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005576610565186 + }, + { + "episode": 9312, + "epoch": 0.05579321997339756, + "loss/policy_avg": 0.20043331384658813, + "lr": 9.62870654396728e-06, + "objective/entropy": -224.79660034179688, + "objective/kl": 23.171340942382812, + "objective/non_score_reward": -1.1585670709609985, + "objective/rlhf_reward": -2.6868569953011825, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.841948986053467, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.603515625, + "step": 581, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00144362449646 + }, + { + "episode": 9328, + "epoch": 0.05588908461252711, + "loss/policy_avg": 0.28447139263153076, + "lr": 9.628067484662578e-06, + "objective/entropy": -44.1309814453125, + "objective/kl": 42.387351989746094, + "objective/non_score_reward": -2.1193675994873047, + "objective/rlhf_reward": -7.151957724124117, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 20.72610092163086, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.419921875, + "step": 582, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971909523010254 + }, + { + "episode": 9344, + "epoch": 0.05598494925165666, + "loss/policy_avg": 0.09533769637346268, + "lr": 9.627428425357874e-06, + "objective/entropy": -218.9058380126953, + "objective/kl": 27.360652923583984, + "objective/non_score_reward": -1.368032693862915, + "objective/rlhf_reward": -4.021532396884307, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 11.28432846069336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 583, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9974052906036377 + }, + { + "episode": 9360, + "epoch": 0.05608081389078621, + "loss/policy_avg": 0.5065032243728638, + "lr": 9.626789366053171e-06, + "objective/entropy": -231.38427734375, + "objective/kl": 32.08224105834961, + "objective/non_score_reward": -1.604112148284912, + "objective/rlhf_reward": -5.0748127012545154, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 40.948760986328125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69921875, + "step": 584, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9988150596618652 + }, + { + "episode": 9376, + "epoch": 0.05617667852991576, + "loss/policy_avg": 0.6530688405036926, + "lr": 9.626150306748468e-06, + "objective/entropy": -116.65798950195312, + "objective/kl": 31.407730102539062, + "objective/non_score_reward": -1.570386528968811, + "objective/rlhf_reward": -4.902944007006985, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 13.348186492919922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.54296875, + "step": 585, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000188112258911 + }, + { + "episode": 9392, + "epoch": 0.05627254316904531, + "loss/policy_avg": -0.06093317270278931, + "lr": 9.625511247443763e-06, + "objective/entropy": -245.7208251953125, + "objective/kl": 22.28873634338379, + "objective/non_score_reward": -1.1144368648529053, + "objective/rlhf_reward": -2.33504098869947, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.7080774307250977, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.681640625, + "step": 586, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002075672149658 + }, + { + "episode": 9408, + "epoch": 0.056368407808174856, + "loss/policy_avg": 0.4493389129638672, + "lr": 9.62487218813906e-06, + "objective/entropy": -11.156410217285156, + "objective/kl": 29.71312141418457, + "objective/non_score_reward": -1.4856561422348022, + "objective/rlhf_reward": -4.117795641693186, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 18.012893676757812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.759765625, + "step": 587, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000352382659912 + }, + { + "episode": 9424, + "epoch": 0.056464272447304405, + "loss/policy_avg": 0.3274408280849457, + "lr": 9.624233128834357e-06, + "objective/entropy": -116.3506088256836, + "objective/kl": 35.94437026977539, + "objective/non_score_reward": -1.7972185611724854, + "objective/rlhf_reward": -4.788874185085296, + "objective/scores": 0.6, + "policy/approxkl_avg": 17.158645629882812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69140625, + "step": 588, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996552467346191 + }, + { + "episode": 9440, + "epoch": 0.056560137086433954, + "loss/policy_avg": 0.879096508026123, + "lr": 9.623594069529654e-06, + "objective/entropy": -152.50155639648438, + "objective/kl": 32.464576721191406, + "objective/non_score_reward": -1.623228669166565, + "objective/rlhf_reward": -5.069082756240931, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 70.49058532714844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.849609375, + "step": 589, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001586437225342 + }, + { + "episode": 9456, + "epoch": 0.0566560017255635, + "loss/policy_avg": 0.2921786904335022, + "lr": 9.62295501022495e-06, + "objective/entropy": -177.27088928222656, + "objective/kl": 39.783531188964844, + "objective/non_score_reward": -1.989176630973816, + "objective/rlhf_reward": -6.57810423621307, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 77.26689147949219, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6796875, + "step": 590, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989970922470093 + }, + { + "episode": 9472, + "epoch": 0.05675186636469305, + "loss/policy_avg": 0.3912142515182495, + "lr": 9.622315950920246e-06, + "objective/entropy": -120.1540756225586, + "objective/kl": 31.21270179748535, + "objective/non_score_reward": -1.5606350898742676, + "objective/rlhf_reward": -3.842540299892425, + "objective/scores": 0.6, + "policy/approxkl_avg": 25.256790161132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.654296875, + "step": 591, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980988502502441 + }, + { + "episode": 9488, + "epoch": 0.0568477310038226, + "loss/policy_avg": 0.04369340091943741, + "lr": 9.621676891615543e-06, + "objective/entropy": -277.40753173828125, + "objective/kl": 29.685585021972656, + "objective/non_score_reward": -1.4842792749404907, + "objective/rlhf_reward": -1.5371170997619625, + "objective/scores": 1.1, + "policy/approxkl_avg": 7.890674591064453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.677734375, + "step": 592, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981930255889893 + }, + { + "episode": 9504, + "epoch": 0.05694359564295215, + "loss/policy_avg": 0.05721379816532135, + "lr": 9.621037832310838e-06, + "objective/entropy": -257.69232177734375, + "objective/kl": 23.966060638427734, + "objective/non_score_reward": -1.19830322265625, + "objective/rlhf_reward": -3.0598793412248293, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 20.133102416992188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.732421875, + "step": 593, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001232624053955 + }, + { + "episode": 9520, + "epoch": 0.0570394602820817, + "loss/policy_avg": 0.5772296786308289, + "lr": 9.620398773006135e-06, + "objective/entropy": -89.6330795288086, + "objective/kl": 31.078372955322266, + "objective/non_score_reward": -1.5539186000823975, + "objective/rlhf_reward": -4.734722021038889, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 21.1763916015625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.87109375, + "step": 594, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000513792037964 + }, + { + "episode": 9536, + "epoch": 0.05713532492121125, + "loss/policy_avg": -0.026315592229366302, + "lr": 9.619759713701432e-06, + "objective/entropy": -219.30979919433594, + "objective/kl": 26.461135864257812, + "objective/non_score_reward": -1.323056697845459, + "objective/rlhf_reward": -3.9329772827371787, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 8.585318565368652, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.677734375, + "step": 595, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0008788108825684 + }, + { + "episode": 9552, + "epoch": 0.057231189560340796, + "loss/policy_avg": 0.2548080384731293, + "lr": 9.619120654396729e-06, + "objective/entropy": -37.27716827392578, + "objective/kl": 44.03446960449219, + "objective/non_score_reward": -2.201723575592041, + "objective/rlhf_reward": -7.356295923800811, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 21.06201934814453, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 596, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991514682769775 + }, + { + "episode": 9568, + "epoch": 0.057327054199470345, + "loss/policy_avg": 2.5911049842834473, + "lr": 9.618481595092026e-06, + "objective/entropy": -171.7782745361328, + "objective/kl": 20.800029754638672, + "objective/non_score_reward": -1.0400015115737915, + "objective/rlhf_reward": -1.760006046295166, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.9469943046569824, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 597, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.014529228210449 + }, + { + "episode": 9584, + "epoch": 0.057422918838599894, + "loss/policy_avg": -0.1166892945766449, + "lr": 9.617842535787323e-06, + "objective/entropy": -109.67333221435547, + "objective/kl": 34.37934494018555, + "objective/non_score_reward": -1.7189671993255615, + "objective/rlhf_reward": -6.8758686780929565, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.377391815185547, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4794921875, + "step": 598, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002119541168213 + }, + { + "episode": 9600, + "epoch": 0.05751878347772944, + "loss/policy_avg": -0.15396325290203094, + "lr": 9.617203476482618e-06, + "objective/entropy": -128.05728149414062, + "objective/kl": 29.42688751220703, + "objective/non_score_reward": -1.4713443517684937, + "objective/rlhf_reward": -4.060548658641886, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.408236026763916, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.71484375, + "step": 599, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.002739429473877 + }, + { + "episode": 9616, + "epoch": 0.05761464811685899, + "loss/policy_avg": 0.14407247304916382, + "lr": 9.616564417177915e-06, + "objective/entropy": -272.3529357910156, + "objective/kl": 21.596874237060547, + "objective/non_score_reward": -1.0798437595367432, + "objective/rlhf_reward": -1.3956560238611426, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.104412078857422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 600, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001608371734619 + }, + { + "episode": 9632, + "epoch": 0.05771051275598854, + "loss/policy_avg": 0.20445303618907928, + "lr": 9.615925357873211e-06, + "objective/entropy": -291.0384521484375, + "objective/kl": 28.06856918334961, + "objective/non_score_reward": -1.403428554534912, + "objective/rlhf_reward": -4.235111692038876, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 9.333198547363281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.720703125, + "step": 601, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0009381771087646 + }, + { + "episode": 9648, + "epoch": 0.05780637739511809, + "loss/policy_avg": 0.7656448483467102, + "lr": 9.615286298568508e-06, + "objective/entropy": -4.355806350708008, + "objective/kl": 34.863006591796875, + "objective/non_score_reward": -1.7431503534317017, + "objective/rlhf_reward": -5.548769433696833, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 10.645190238952637, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.591796875, + "step": 602, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971041679382324 + }, + { + "episode": 9664, + "epoch": 0.05790224203424764, + "loss/policy_avg": 0.1100698709487915, + "lr": 9.614647239263805e-06, + "objective/entropy": -203.49618530273438, + "objective/kl": 19.046649932861328, + "objective/non_score_reward": -0.9523325562477112, + "objective/rlhf_reward": -2.4307281161225855, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 0.499467670917511, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.638671875, + "step": 603, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0019733905792236 + }, + { + "episode": 9680, + "epoch": 0.05799810667337719, + "loss/policy_avg": 0.17878472805023193, + "lr": 9.6140081799591e-06, + "objective/entropy": -162.996826171875, + "objective/kl": 23.458127975463867, + "objective/non_score_reward": -1.172906517982483, + "objective/rlhf_reward": -3.3661131596862504, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 8.434497833251953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5390625, + "step": 604, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9986371994018555 + }, + { + "episode": 9696, + "epoch": 0.058093971312506744, + "loss/policy_avg": 0.5608217716217041, + "lr": 9.613369120654397e-06, + "objective/entropy": -168.91802978515625, + "objective/kl": 31.90495491027832, + "objective/non_score_reward": -1.5952478647232056, + "objective/rlhf_reward": -3.4572724446069927, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 10.658321380615234, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.57421875, + "step": 605, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999322772026062 + }, + { + "episode": 9712, + "epoch": 0.05818983595163629, + "loss/policy_avg": 0.10194225609302521, + "lr": 9.612730061349694e-06, + "objective/entropy": -138.00286865234375, + "objective/kl": 34.8355712890625, + "objective/non_score_reward": -1.7417783737182617, + "objective/rlhf_reward": -5.641600999861879, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 19.823665618896484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.765625, + "step": 606, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000347852706909 + }, + { + "episode": 9728, + "epoch": 0.05828570059076584, + "loss/policy_avg": 1.170401930809021, + "lr": 9.612091002044991e-06, + "objective/entropy": -171.179443359375, + "objective/kl": 23.883764266967773, + "objective/non_score_reward": -1.1941882371902466, + "objective/rlhf_reward": -3.2609813449704017, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 1.674392819404602, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.654296875, + "step": 607, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004193782806396 + }, + { + "episode": 9744, + "epoch": 0.05838156522989539, + "loss/policy_avg": 0.05054464191198349, + "lr": 9.611451942740288e-06, + "objective/entropy": -196.56436157226562, + "objective/kl": 23.218883514404297, + "objective/non_score_reward": -1.1609442234039307, + "objective/rlhf_reward": -3.1931789918855276, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 11.145727157592773, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60546875, + "step": 608, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998692512512207 + }, + { + "episode": 9760, + "epoch": 0.05847742986902494, + "loss/policy_avg": 0.054385945200920105, + "lr": 9.610812883435585e-06, + "objective/entropy": -244.93141174316406, + "objective/kl": 29.985477447509766, + "objective/non_score_reward": -1.4992740154266357, + "objective/rlhf_reward": -4.637845957015438, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 19.703460693359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 609, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0000925064086914 + }, + { + "episode": 9776, + "epoch": 0.05857329450815449, + "loss/policy_avg": -0.05685323104262352, + "lr": 9.61017382413088e-06, + "objective/entropy": -65.63417053222656, + "objective/kl": 31.53623390197754, + "objective/non_score_reward": -1.5768117904663086, + "objective/rlhf_reward": -3.383528147579405, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 12.860790252685547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.59375, + "step": 610, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001614809036255 + }, + { + "episode": 9792, + "epoch": 0.05866915914728404, + "loss/policy_avg": 0.20876801013946533, + "lr": 9.609534764826177e-06, + "objective/entropy": -112.53227996826172, + "objective/kl": 41.12568664550781, + "objective/non_score_reward": -2.0562844276428223, + "objective/rlhf_reward": -5.825137710571289, + "objective/scores": 0.6, + "policy/approxkl_avg": 33.385337829589844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 611, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000166654586792 + }, + { + "episode": 9808, + "epoch": 0.05876502378641359, + "loss/policy_avg": 0.2722185552120209, + "lr": 9.608895705521472e-06, + "objective/entropy": -124.71205139160156, + "objective/kl": 38.9796257019043, + "objective/non_score_reward": -1.9489812850952148, + "objective/rlhf_reward": -5.395925498008728, + "objective/scores": 0.6, + "policy/approxkl_avg": 19.52260971069336, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.623046875, + "step": 612, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988607168197632 + }, + { + "episode": 9824, + "epoch": 0.058860888425543136, + "loss/policy_avg": 0.7936792969703674, + "lr": 9.608256646216769e-06, + "objective/entropy": -150.9628448486328, + "objective/kl": 32.946922302246094, + "objective/non_score_reward": -1.6473462581634521, + "objective/rlhf_reward": -5.165553171833125, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 23.228769302368164, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.537109375, + "step": 613, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001022815704346 + }, + { + "episode": 9840, + "epoch": 0.058956753064672685, + "loss/policy_avg": 0.8288295269012451, + "lr": 9.607617586912066e-06, + "objective/entropy": -145.37136840820312, + "objective/kl": 37.17048645019531, + "objective/non_score_reward": -1.8585245609283447, + "objective/rlhf_reward": -5.6092691376534205, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 8.95422077178955, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.568359375, + "step": 614, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995349645614624 + }, + { + "episode": 9856, + "epoch": 0.05905261770380223, + "loss/policy_avg": 0.19199243187904358, + "lr": 9.606978527607363e-06, + "objective/entropy": -158.26043701171875, + "objective/kl": 31.016521453857422, + "objective/non_score_reward": -1.550826072692871, + "objective/rlhf_reward": -4.8440544244989585, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.4004452228546143, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5546875, + "step": 615, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00236439704895 + }, + { + "episode": 9872, + "epoch": 0.05914848234293178, + "loss/policy_avg": 0.29752206802368164, + "lr": 9.60633946830266e-06, + "objective/entropy": -141.43800354003906, + "objective/kl": 27.8808536529541, + "objective/non_score_reward": -1.394042730331421, + "objective/rlhf_reward": -3.842837558190028, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 10.629474639892578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5078125, + "step": 616, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00028657913208 + }, + { + "episode": 9888, + "epoch": 0.05924434698206133, + "loss/policy_avg": 0.2227097749710083, + "lr": 9.605700408997955e-06, + "objective/entropy": -97.0810775756836, + "objective/kl": 34.3601188659668, + "objective/non_score_reward": -1.718005895614624, + "objective/rlhf_reward": -5.4481916024285235, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 16.432331085205078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.548828125, + "step": 617, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975783824920654 + }, + { + "episode": 9904, + "epoch": 0.05934021162119088, + "loss/policy_avg": 0.17975842952728271, + "lr": 9.605061349693252e-06, + "objective/entropy": -200.100830078125, + "objective/kl": 28.51620864868164, + "objective/non_score_reward": -1.4258103370666504, + "objective/rlhf_reward": -3.8784127190438022, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.591612815856934, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 618, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0000221729278564 + }, + { + "episode": 9920, + "epoch": 0.05943607626032043, + "loss/policy_avg": 0.4452857971191406, + "lr": 9.604422290388548e-06, + "objective/entropy": -87.9361572265625, + "objective/kl": 34.174217224121094, + "objective/non_score_reward": -1.7087109088897705, + "objective/rlhf_reward": -5.278584449496821, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 24.203800201416016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.587890625, + "step": 619, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989922046661377 + }, + { + "episode": 9936, + "epoch": 0.05953194089944998, + "loss/policy_avg": 0.31785786151885986, + "lr": 9.603783231083845e-06, + "objective/entropy": -56.93491744995117, + "objective/kl": 34.28547286987305, + "objective/non_score_reward": -1.7142736911773682, + "objective/rlhf_reward": -5.032265897068094, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 12.636474609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.85546875, + "step": 620, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996368408203125 + }, + { + "episode": 9952, + "epoch": 0.05962780553857953, + "loss/policy_avg": 0.6350647211074829, + "lr": 9.603144171779142e-06, + "objective/entropy": -129.3587188720703, + "objective/kl": 41.710655212402344, + "objective/non_score_reward": -2.0855326652526855, + "objective/rlhf_reward": -6.219424667135749, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 11.748146057128906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4775390625, + "step": 621, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9979774951934814 + }, + { + "episode": 9968, + "epoch": 0.059723670177709076, + "loss/policy_avg": 0.9843254089355469, + "lr": 9.602505112474439e-06, + "objective/entropy": -95.34288024902344, + "objective/kl": 49.37370300292969, + "objective/non_score_reward": -2.4686851501464844, + "objective/rlhf_reward": -8.049912209781716, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 31.02006721496582, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4970703125, + "step": 622, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9965976476669312 + }, + { + "episode": 9984, + "epoch": 0.059819534816838625, + "loss/policy_avg": 0.6165390610694885, + "lr": 9.601866053169734e-06, + "objective/entropy": -100.56966400146484, + "objective/kl": 33.22990036010742, + "objective/non_score_reward": -1.6614950895309448, + "objective/rlhf_reward": -5.286730491851253, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 13.85442066192627, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.61328125, + "step": 623, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971177577972412 + }, + { + "episode": 10000, + "epoch": 0.059915399455968174, + "loss/policy_avg": 0.3318287134170532, + "lr": 9.601226993865031e-06, + "objective/entropy": -212.1555938720703, + "objective/kl": 25.822668075561523, + "objective/non_score_reward": -1.2911334037780762, + "objective/rlhf_reward": -2.2408145412218303, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.2788864374160767, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.525390625, + "step": 624, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995461702346802 + }, + { + "episode": 10016, + "epoch": 0.06001126409509772, + "loss/policy_avg": 0.35671815276145935, + "lr": 9.600587934560328e-06, + "objective/entropy": -96.60403442382812, + "objective/kl": 42.28247833251953, + "objective/non_score_reward": -2.114124059677124, + "objective/rlhf_reward": -6.6316679671135645, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 9.525958061218262, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.55859375, + "step": 625, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999858021736145 + }, + { + "episode": 10032, + "epoch": 0.06010712873422727, + "loss/policy_avg": 0.026430530473589897, + "lr": 9.599948875255625e-06, + "objective/entropy": -96.45112609863281, + "objective/kl": 30.055763244628906, + "objective/non_score_reward": -1.5027881860733032, + "objective/rlhf_reward": -4.56055448493515, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 6.234503746032715, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.521484375, + "step": 626, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002078056335449 + }, + { + "episode": 10048, + "epoch": 0.06020299337335682, + "loss/policy_avg": -0.07770150899887085, + "lr": 9.599309815950922e-06, + "objective/entropy": -78.50785827636719, + "objective/kl": 33.19765090942383, + "objective/non_score_reward": -1.6598827838897705, + "objective/rlhf_reward": -5.158578279431223, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 60.745849609375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5546875, + "step": 627, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0026586055755615 + }, + { + "episode": 10064, + "epoch": 0.06029885801248637, + "loss/policy_avg": 0.045525066554546356, + "lr": 9.598670756646217e-06, + "objective/entropy": -207.98727416992188, + "objective/kl": 34.44676208496094, + "objective/non_score_reward": -1.7223379611968994, + "objective/rlhf_reward": -5.489351963996887, + "objective/scores": 0.35, + "policy/approxkl_avg": 2.952592372894287, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.671875, + "step": 628, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989871978759766 + }, + { + "episode": 10080, + "epoch": 0.06039472265161592, + "loss/policy_avg": 0.32521092891693115, + "lr": 9.598031697341514e-06, + "objective/entropy": -71.00718688964844, + "objective/kl": 27.00582504272461, + "objective/non_score_reward": -1.3502912521362305, + "objective/rlhf_reward": -3.977332849701015, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 5.865281105041504, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7578125, + "step": 629, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001133918762207 + }, + { + "episode": 10096, + "epoch": 0.06049058729074547, + "loss/policy_avg": 0.22257700562477112, + "lr": 9.59739263803681e-06, + "objective/entropy": -87.40052795410156, + "objective/kl": 31.356922149658203, + "objective/non_score_reward": -1.5678460597991943, + "objective/rlhf_reward": -4.32397324867719, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 27.549453735351562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.546875, + "step": 630, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999577522277832 + }, + { + "episode": 10112, + "epoch": 0.06058645192987502, + "loss/policy_avg": 0.4591647982597351, + "lr": 9.596753578732108e-06, + "objective/entropy": -35.01010513305664, + "objective/kl": 28.93059539794922, + "objective/non_score_reward": -1.4465298652648926, + "objective/rlhf_reward": -4.42686941597311, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 10.006196975708008, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8203125, + "step": 631, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9968595504760742 + }, + { + "episode": 10128, + "epoch": 0.060682316569004566, + "loss/policy_avg": 0.9483177661895752, + "lr": 9.596114519427405e-06, + "objective/entropy": -152.91030883789062, + "objective/kl": 30.360069274902344, + "objective/non_score_reward": -1.5180034637451172, + "objective/rlhf_reward": -4.338680283228555, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 15.410400390625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.599609375, + "step": 632, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9943327903747559 + }, + { + "episode": 10144, + "epoch": 0.060778181208134115, + "loss/policy_avg": 0.4167541265487671, + "lr": 9.595475460122701e-06, + "objective/entropy": -154.04684448242188, + "objective/kl": 33.39550018310547, + "objective/non_score_reward": -1.6697750091552734, + "objective/rlhf_reward": -5.074980471197682, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 53.406578063964844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.576171875, + "step": 633, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9963878393173218 + }, + { + "episode": 10160, + "epoch": 0.060874045847263664, + "loss/policy_avg": -0.021846026182174683, + "lr": 9.594836400817997e-06, + "objective/entropy": -22.81509780883789, + "objective/kl": 23.709880828857422, + "objective/non_score_reward": -1.1854941844940186, + "objective/rlhf_reward": -2.917147810730051, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 0.839837908744812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6953125, + "step": 634, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000304937362671 + }, + { + "episode": 10176, + "epoch": 0.06096991048639321, + "loss/policy_avg": 0.014755940064787865, + "lr": 9.594197341513293e-06, + "objective/entropy": -198.07839965820312, + "objective/kl": 21.79191017150879, + "objective/non_score_reward": -1.0895954370498657, + "objective/rlhf_reward": -1.9583818078041078, + "objective/scores": 0.6, + "policy/approxkl_avg": 0.6484163999557495, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 635, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002074241638184 + }, + { + "episode": 10192, + "epoch": 0.06106577512552276, + "loss/policy_avg": 0.13533297181129456, + "lr": 9.593558282208589e-06, + "objective/entropy": -201.26246643066406, + "objective/kl": 26.135250091552734, + "objective/non_score_reward": -1.3067626953125, + "objective/rlhf_reward": -3.885414889364868, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 11.92165756225586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.740234375, + "step": 636, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993889331817627 + }, + { + "episode": 10208, + "epoch": 0.06116163976465231, + "loss/policy_avg": 0.4021642506122589, + "lr": 9.592919222903886e-06, + "objective/entropy": -286.0339050292969, + "objective/kl": 14.542181968688965, + "objective/non_score_reward": -0.7271090745925903, + "objective/rlhf_reward": -1.484604258735744, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 5.031335353851318, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.701171875, + "step": 637, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.003591775894165 + }, + { + "episode": 10224, + "epoch": 0.06125750440378186, + "loss/policy_avg": 0.2514651417732239, + "lr": 9.592280163599182e-06, + "objective/entropy": -132.75355529785156, + "objective/kl": 25.25128173828125, + "objective/non_score_reward": -1.2625641822814941, + "objective/rlhf_reward": -3.5996581717446894, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 14.74315071105957, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.712890625, + "step": 638, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000126361846924 + }, + { + "episode": 10240, + "epoch": 0.06135336904291141, + "loss/policy_avg": 0.012995198369026184, + "lr": 9.59164110429448e-06, + "objective/entropy": -181.2290496826172, + "objective/kl": 22.253154754638672, + "objective/non_score_reward": -1.1126577854156494, + "objective/rlhf_reward": -3.026798923214046, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 0.9591898918151855, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 639, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993176460266113 + }, + { + "episode": 10256, + "epoch": 0.06144923368204096, + "loss/policy_avg": 0.15271592140197754, + "lr": 9.591002044989776e-06, + "objective/entropy": -105.57412719726562, + "objective/kl": 38.59171676635742, + "objective/non_score_reward": -1.9295859336853027, + "objective/rlhf_reward": -6.16208431026037, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 6.626259803771973, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.734375, + "step": 640, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996435284614563 + }, + { + "episode": 10272, + "epoch": 0.061545098321170506, + "loss/policy_avg": -0.11524446308612823, + "lr": 9.590362985685071e-06, + "objective/entropy": -123.53447723388672, + "objective/kl": 26.7266845703125, + "objective/non_score_reward": -1.336334228515625, + "objective/rlhf_reward": -3.222630920187507, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.8472533226013184, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46484375, + "step": 641, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.011606216430664 + }, + { + "episode": 10288, + "epoch": 0.061640962960300055, + "loss/policy_avg": 0.4013972282409668, + "lr": 9.589723926380368e-06, + "objective/entropy": -128.90103149414062, + "objective/kl": 31.007064819335938, + "objective/non_score_reward": -1.5503532886505127, + "objective/rlhf_reward": -4.685641431602177, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 6.671117782592773, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.556640625, + "step": 642, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970167875289917 + }, + { + "episode": 10304, + "epoch": 0.061736827599429604, + "loss/policy_avg": 0.7907944321632385, + "lr": 9.589084867075665e-06, + "objective/entropy": -58.220497131347656, + "objective/kl": 41.770606994628906, + "objective/non_score_reward": -2.0885305404663086, + "objective/rlhf_reward": -6.620788232485452, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 17.74094581604004, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.466796875, + "step": 643, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995949149131775 + }, + { + "episode": 10320, + "epoch": 0.06183269223855915, + "loss/policy_avg": 0.017528323456645012, + "lr": 9.588445807770962e-06, + "objective/entropy": -208.79119873046875, + "objective/kl": 23.041034698486328, + "objective/non_score_reward": -1.1520518064498901, + "objective/rlhf_reward": -3.092435383590397, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 1.83624267578125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.732421875, + "step": 644, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0010547637939453 + }, + { + "episode": 10336, + "epoch": 0.0619285568776887, + "loss/policy_avg": 0.15500307083129883, + "lr": 9.587806748466259e-06, + "objective/entropy": -124.78570556640625, + "objective/kl": 34.243202209472656, + "objective/non_score_reward": -1.7121602296829224, + "objective/rlhf_reward": -3.92492190444586, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.4558181762695312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5078125, + "step": 645, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997308254241943 + }, + { + "episode": 10352, + "epoch": 0.06202442151681825, + "loss/policy_avg": 0.2161247283220291, + "lr": 9.587167689161556e-06, + "objective/entropy": -163.63064575195312, + "objective/kl": 25.873336791992188, + "objective/non_score_reward": -1.293666958808899, + "objective/rlhf_reward": -3.7960657263673365, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 8.89102840423584, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5234375, + "step": 646, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998837947845459 + }, + { + "episode": 10368, + "epoch": 0.0621202861559478, + "loss/policy_avg": 0.08966261148452759, + "lr": 9.586528629856851e-06, + "objective/entropy": -104.2444076538086, + "objective/kl": 33.29509735107422, + "objective/non_score_reward": -1.664754867553711, + "objective/rlhf_reward": -4.925686256090799, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.3677499294281006, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 647, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993959665298462 + }, + { + "episode": 10384, + "epoch": 0.06221615079507735, + "loss/policy_avg": -0.02724701538681984, + "lr": 9.585889570552148e-06, + "objective/entropy": -133.99429321289062, + "objective/kl": 27.543067932128906, + "objective/non_score_reward": -1.3771533966064453, + "objective/rlhf_reward": -3.1086136460304257, + "objective/scores": 0.6, + "policy/approxkl_avg": 7.215035438537598, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.677734375, + "step": 648, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986003637313843 + }, + { + "episode": 10400, + "epoch": 0.0623120154342069, + "loss/policy_avg": -0.23539991676807404, + "lr": 9.585250511247445e-06, + "objective/entropy": -167.906494140625, + "objective/kl": 25.879772186279297, + "objective/non_score_reward": -1.293988585472107, + "objective/rlhf_reward": -3.571834478441792, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.0954341888427734, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.57421875, + "step": 649, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997683763504028 + }, + { + "episode": 10416, + "epoch": 0.06240788007333645, + "loss/policy_avg": 0.30569222569465637, + "lr": 9.584611451942742e-06, + "objective/entropy": -226.60678100585938, + "objective/kl": 28.675113677978516, + "objective/non_score_reward": -1.433755874633789, + "objective/rlhf_reward": -3.7876121503877, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 52.77922058105469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6484375, + "step": 650, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9980084896087646 + }, + { + "episode": 10432, + "epoch": 0.062503744712466, + "loss/policy_avg": -0.24214023351669312, + "lr": 9.583972392638038e-06, + "objective/entropy": -121.17498779296875, + "objective/kl": 38.84062957763672, + "objective/non_score_reward": -1.9420316219329834, + "objective/rlhf_reward": -5.820715139584477, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.8967432975769043, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.623046875, + "step": 651, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0004329681396484 + }, + { + "episode": 10448, + "epoch": 0.06259960935159554, + "loss/policy_avg": -0.3156575858592987, + "lr": 9.583333333333335e-06, + "objective/entropy": -146.38143920898438, + "objective/kl": 32.020687103271484, + "objective/non_score_reward": -1.60103440284729, + "objective/rlhf_reward": -5.062502017527251, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 2.199296236038208, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.642578125, + "step": 652, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0025553703308105 + }, + { + "episode": 10464, + "epoch": 0.0626954739907251, + "loss/policy_avg": 0.07271748781204224, + "lr": 9.58269427402863e-06, + "objective/entropy": -196.48562622070312, + "objective/kl": 28.001068115234375, + "objective/non_score_reward": -1.4000535011291504, + "objective/rlhf_reward": -4.2002141833305355, + "objective/scores": 0.35, + "policy/approxkl_avg": 24.475753784179688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 653, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0010745525360107 + }, + { + "episode": 10480, + "epoch": 0.06279133862985464, + "loss/policy_avg": 0.17373695969581604, + "lr": 9.582055214723927e-06, + "objective/entropy": -275.5335388183594, + "objective/kl": 27.79926300048828, + "objective/non_score_reward": -1.3899632692337036, + "objective/rlhf_reward": -5.5598530769348145, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.22200584411621, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.615234375, + "step": 654, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987027645111084 + }, + { + "episode": 10496, + "epoch": 0.0628872032689842, + "loss/policy_avg": 0.15186084806919098, + "lr": 9.581416155419224e-06, + "objective/entropy": -197.2568817138672, + "objective/kl": 23.105377197265625, + "objective/non_score_reward": -1.1552690267562866, + "objective/rlhf_reward": -2.796247239383768, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 35.64599609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7734375, + "step": 655, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9985601902008057 + }, + { + "episode": 10512, + "epoch": 0.06298306790811374, + "loss/policy_avg": 0.09821736067533493, + "lr": 9.58077709611452e-06, + "objective/entropy": -192.20767211914062, + "objective/kl": 28.659635543823242, + "objective/non_score_reward": -1.4329817295074463, + "objective/rlhf_reward": -4.070067649305449, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 5.6847333908081055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.712890625, + "step": 656, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977080821990967 + }, + { + "episode": 10528, + "epoch": 0.0630789325472433, + "loss/policy_avg": 0.24115119874477386, + "lr": 9.580138036809816e-06, + "objective/entropy": -171.08619689941406, + "objective/kl": 26.453920364379883, + "objective/non_score_reward": -1.3226962089538574, + "objective/rlhf_reward": -3.8907844781875607, + "objective/scores": 0.35, + "policy/approxkl_avg": 11.276920318603516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 657, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999925136566162 + }, + { + "episode": 10544, + "epoch": 0.06317479718637284, + "loss/policy_avg": -0.04878993332386017, + "lr": 9.579498977505113e-06, + "objective/entropy": -95.69158172607422, + "objective/kl": 26.445575714111328, + "objective/non_score_reward": -1.3222787380218506, + "objective/rlhf_reward": -3.94747917941156, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 5.285589218139648, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.625, + "step": 658, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0013298988342285 + }, + { + "episode": 10560, + "epoch": 0.0632706618255024, + "loss/policy_avg": -0.10105658322572708, + "lr": 9.57885991820041e-06, + "objective/entropy": -209.01065063476562, + "objective/kl": 27.234224319458008, + "objective/non_score_reward": -1.3617112636566162, + "objective/rlhf_reward": -4.046844816207885, + "objective/scores": 0.35, + "policy/approxkl_avg": 2.436962366104126, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6328125, + "step": 659, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001246690750122 + }, + { + "episode": 10576, + "epoch": 0.06336652646463194, + "loss/policy_avg": -0.3218346834182739, + "lr": 9.578220858895705e-06, + "objective/entropy": -3.9748001098632812, + "objective/kl": 18.186880111694336, + "objective/non_score_reward": -0.9093440771102905, + "objective/rlhf_reward": -1.5146698824324945, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 28.07345962524414, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8359375, + "step": 660, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002238750457764 + }, + { + "episode": 10592, + "epoch": 0.06346239110376149, + "loss/policy_avg": -0.19762462377548218, + "lr": 9.577581799591002e-06, + "objective/entropy": -204.72760009765625, + "objective/kl": 18.785112380981445, + "objective/non_score_reward": -0.9392555356025696, + "objective/rlhf_reward": -1.6343160293259955, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.8940598964691162, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.560546875, + "step": 661, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0025925636291504 + }, + { + "episode": 10608, + "epoch": 0.06355825574289103, + "loss/policy_avg": -0.45743584632873535, + "lr": 9.576942740286299e-06, + "objective/entropy": -134.4844970703125, + "objective/kl": 33.7373046875, + "objective/non_score_reward": -1.6868653297424316, + "objective/rlhf_reward": -5.296863298030242, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 2.153486967086792, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.521484375, + "step": 662, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00048828125 + }, + { + "episode": 10624, + "epoch": 0.06365412038202059, + "loss/policy_avg": 0.2565079629421234, + "lr": 9.576303680981596e-06, + "objective/entropy": -180.13528442382812, + "objective/kl": 17.24534034729004, + "objective/non_score_reward": -0.862267017364502, + "objective/rlhf_reward": -2.089818143580837, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 7.433453559875488, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 663, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994475841522217 + }, + { + "episode": 10640, + "epoch": 0.06374998502115013, + "loss/policy_avg": 0.17452527582645416, + "lr": 9.575664621676893e-06, + "objective/entropy": -64.2728271484375, + "objective/kl": 21.405649185180664, + "objective/non_score_reward": -1.0702824592590332, + "objective/rlhf_reward": -2.9218800303682517, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.6351606845855713, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71484375, + "step": 664, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0012965202331543 + }, + { + "episode": 10656, + "epoch": 0.06384584966027969, + "loss/policy_avg": 0.6966801881790161, + "lr": 9.57502556237219e-06, + "objective/entropy": -251.04238891601562, + "objective/kl": 27.693851470947266, + "objective/non_score_reward": -1.384692668914795, + "objective/rlhf_reward": -3.934650454584675, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 10.390886306762695, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.5859375, + "step": 665, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001401662826538 + }, + { + "episode": 10672, + "epoch": 0.06394171429940923, + "loss/policy_avg": 0.16458481550216675, + "lr": 9.574386503067485e-06, + "objective/entropy": -219.99136352539062, + "objective/kl": 13.308931350708008, + "objective/non_score_reward": -0.6654465198516846, + "objective/rlhf_reward": -0.7143749100732166, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 3.77976131439209, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 666, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000656843185425 + }, + { + "episode": 10688, + "epoch": 0.06403757893853879, + "loss/policy_avg": -0.009436726570129395, + "lr": 9.573747443762782e-06, + "objective/entropy": -162.25047302246094, + "objective/kl": 23.977962493896484, + "objective/non_score_reward": -1.1988980770111084, + "objective/rlhf_reward": -2.8481810791062667, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 22.450942993164062, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.763671875, + "step": 667, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0019123554229736 + }, + { + "episode": 10704, + "epoch": 0.06413344357766833, + "loss/policy_avg": 0.4135128855705261, + "lr": 9.573108384458079e-06, + "objective/entropy": -63.0797119140625, + "objective/kl": 41.37904739379883, + "objective/non_score_reward": -2.0689523220062256, + "objective/rlhf_reward": -6.542475895086923, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 88.98745727539062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.779296875, + "step": 668, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999923706054688 + }, + { + "episode": 10720, + "epoch": 0.06422930821679788, + "loss/policy_avg": 0.6821532845497131, + "lr": 9.572469325153375e-06, + "objective/entropy": -196.7287139892578, + "objective/kl": 30.88260269165039, + "objective/non_score_reward": -1.5441300868988037, + "objective/rlhf_reward": -4.660748505386051, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 23.963293075561523, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.78515625, + "step": 669, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989004135131836 + }, + { + "episode": 10736, + "epoch": 0.06432517285592743, + "loss/policy_avg": 0.3629915118217468, + "lr": 9.571830265848672e-06, + "objective/entropy": -205.541259765625, + "objective/kl": 24.442432403564453, + "objective/non_score_reward": -1.2221217155456543, + "objective/rlhf_reward": -3.155153171221415, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 15.010305404663086, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.63671875, + "step": 670, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991261959075928 + }, + { + "episode": 10752, + "epoch": 0.06442103749505698, + "loss/policy_avg": 0.3024546504020691, + "lr": 9.571191206543968e-06, + "objective/entropy": -184.0182647705078, + "objective/kl": 28.46197509765625, + "objective/non_score_reward": -1.4230988025665283, + "objective/rlhf_reward": -3.744983862118657, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.1509013175964355, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 671, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998319149017334 + }, + { + "episode": 10768, + "epoch": 0.06451690213418652, + "loss/policy_avg": -0.12359270453453064, + "lr": 9.570552147239264e-06, + "objective/entropy": -107.1251220703125, + "objective/kl": 24.85216522216797, + "objective/non_score_reward": -1.2426085472106934, + "objective/rlhf_reward": -3.611183964942379, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 2.815180540084839, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.62890625, + "step": 672, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002232074737549 + }, + { + "episode": 10784, + "epoch": 0.06461276677331608, + "loss/policy_avg": 0.3783743977546692, + "lr": 9.569913087934561e-06, + "objective/entropy": -155.0634765625, + "objective/kl": 33.26643371582031, + "objective/non_score_reward": -1.663321852684021, + "objective/rlhf_reward": -5.294037544463558, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 7.487679958343506, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.751953125, + "step": 673, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9973864555358887 + }, + { + "episode": 10800, + "epoch": 0.06470863141244562, + "loss/policy_avg": 0.12491178512573242, + "lr": 9.569274028629858e-06, + "objective/entropy": -202.8880157470703, + "objective/kl": 23.53227996826172, + "objective/non_score_reward": -1.1766140460968018, + "objective/rlhf_reward": -2.9731229106585184, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 5.709697246551514, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.576171875, + "step": 674, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979543685913086 + }, + { + "episode": 10816, + "epoch": 0.06480449605157518, + "loss/policy_avg": -0.01751142367720604, + "lr": 9.568634969325155e-06, + "objective/entropy": -217.27896118164062, + "objective/kl": 27.020957946777344, + "objective/non_score_reward": -1.3510478734970093, + "objective/rlhf_reward": -3.4567805034684493, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 0.6378078460693359, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.63671875, + "step": 675, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0012476444244385 + }, + { + "episode": 10832, + "epoch": 0.06490036069070472, + "loss/policy_avg": 0.28126630187034607, + "lr": 9.567995910020452e-06, + "objective/entropy": -230.15963745117188, + "objective/kl": 24.95879364013672, + "objective/non_score_reward": -1.2479398250579834, + "objective/rlhf_reward": -3.329899912298308, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 10.301782608032227, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.744140625, + "step": 676, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993771314620972 + }, + { + "episode": 10848, + "epoch": 0.06499622532983428, + "loss/policy_avg": 0.12287623435258865, + "lr": 9.567356850715747e-06, + "objective/entropy": -263.37542724609375, + "objective/kl": 23.937744140625, + "objective/non_score_reward": -1.1968872547149658, + "objective/rlhf_reward": -0.3875493764877316, + "objective/scores": 1.1, + "policy/approxkl_avg": 45.05952453613281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.703125, + "step": 677, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995876550674438 + }, + { + "episode": 10864, + "epoch": 0.06509208996896382, + "loss/policy_avg": 0.6470179557800293, + "lr": 9.566717791411044e-06, + "objective/entropy": -65.45881652832031, + "objective/kl": 23.807559967041016, + "objective/non_score_reward": -1.190378189086914, + "objective/rlhf_reward": -3.419877028375297, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 10.65350341796875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.744140625, + "step": 678, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999696969985962 + }, + { + "episode": 10880, + "epoch": 0.06518795460809337, + "loss/policy_avg": 0.2790781855583191, + "lr": 9.56607873210634e-06, + "objective/entropy": -161.4605712890625, + "objective/kl": 41.620460510253906, + "objective/non_score_reward": -2.0810232162475586, + "objective/rlhf_reward": -3.924092388153076, + "objective/scores": 1.1, + "policy/approxkl_avg": 5.482306480407715, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.57421875, + "step": 679, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986392259597778 + }, + { + "episode": 10896, + "epoch": 0.06528381924722292, + "loss/policy_avg": 0.042992569506168365, + "lr": 9.565439672801636e-06, + "objective/entropy": -162.92010498046875, + "objective/kl": 26.902143478393555, + "objective/non_score_reward": -1.3451071977615356, + "objective/rlhf_reward": -4.001826503363949, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 4.27599573135376, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.654296875, + "step": 680, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998090147972107 + }, + { + "episode": 10912, + "epoch": 0.06537968388635247, + "loss/policy_avg": 0.20157073438167572, + "lr": 9.564800613496933e-06, + "objective/entropy": -265.3901672363281, + "objective/kl": 29.956632614135742, + "objective/non_score_reward": -1.4978315830230713, + "objective/rlhf_reward": -3.868620397821937, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 68.22042846679688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 681, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982101917266846 + }, + { + "episode": 10928, + "epoch": 0.06547554852548201, + "loss/policy_avg": 1.519484281539917, + "lr": 9.56416155419223e-06, + "objective/entropy": -127.62720489501953, + "objective/kl": 23.382505416870117, + "objective/non_score_reward": -1.1691253185272217, + "objective/rlhf_reward": -2.2765009164810177, + "objective/scores": 0.6, + "policy/approxkl_avg": 17.878856658935547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.419921875, + "step": 682, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984099864959717 + }, + { + "episode": 10944, + "epoch": 0.06557141316461157, + "loss/policy_avg": 0.3158057928085327, + "lr": 9.563522494887527e-06, + "objective/entropy": -190.45260620117188, + "objective/kl": 25.518230438232422, + "objective/non_score_reward": -1.275911569595337, + "objective/rlhf_reward": -3.622693660672068, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 34.12330627441406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.63671875, + "step": 683, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000800371170044 + }, + { + "episode": 10960, + "epoch": 0.06566727780374111, + "loss/policy_avg": 1.1294161081314087, + "lr": 9.562883435582822e-06, + "objective/entropy": -107.20721435546875, + "objective/kl": 32.379913330078125, + "objective/non_score_reward": -1.6189957857131958, + "objective/rlhf_reward": -5.13434737017694, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 7.272080421447754, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5234375, + "step": 684, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998885154724121 + }, + { + "episode": 10976, + "epoch": 0.06576314244287067, + "loss/policy_avg": 0.44281357526779175, + "lr": 9.562244376278119e-06, + "objective/entropy": -128.0640869140625, + "objective/kl": 20.03044891357422, + "objective/non_score_reward": -1.0015225410461426, + "objective/rlhf_reward": -1.082371120096418, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 12.73418140411377, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 685, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999821424484253 + }, + { + "episode": 10992, + "epoch": 0.06585900708200021, + "loss/policy_avg": 0.2683737576007843, + "lr": 9.561605316973416e-06, + "objective/entropy": -258.8201904296875, + "objective/kl": 27.295347213745117, + "objective/non_score_reward": -1.3647674322128296, + "objective/rlhf_reward": -2.535350595356199, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.86362886428833, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71875, + "step": 686, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981340169906616 + }, + { + "episode": 11008, + "epoch": 0.06595487172112977, + "loss/policy_avg": -0.14624132215976715, + "lr": 9.560966257668713e-06, + "objective/entropy": -96.99462890625, + "objective/kl": 30.466350555419922, + "objective/non_score_reward": -1.523317575454712, + "objective/rlhf_reward": -4.57749816158646, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 8.779112815856934, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.494140625, + "step": 687, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9981212615966797 + }, + { + "episode": 11024, + "epoch": 0.06605073636025931, + "loss/policy_avg": 0.12842759490013123, + "lr": 9.56032719836401e-06, + "objective/entropy": -166.20689392089844, + "objective/kl": 26.250516891479492, + "objective/non_score_reward": -1.312525749206543, + "objective/rlhf_reward": -2.8501029968261715, + "objective/scores": 0.6, + "policy/approxkl_avg": 7.160890102386475, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5703125, + "step": 688, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990025758743286 + }, + { + "episode": 11040, + "epoch": 0.06614660099938886, + "loss/policy_avg": 0.2923339009284973, + "lr": 9.559688139059306e-06, + "objective/entropy": -236.72100830078125, + "objective/kl": 33.81795883178711, + "objective/non_score_reward": -1.6908979415893555, + "objective/rlhf_reward": -5.4219562321001575, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 16.3193359375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.650390625, + "step": 689, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99735426902771 + }, + { + "episode": 11056, + "epoch": 0.0662424656385184, + "loss/policy_avg": -0.10266150534152985, + "lr": 9.559049079754601e-06, + "objective/entropy": -85.62126159667969, + "objective/kl": 31.331233978271484, + "objective/non_score_reward": -1.5665616989135742, + "objective/rlhf_reward": -4.143540324942146, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 6.518294811248779, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.794921875, + "step": 690, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.006378650665283 + }, + { + "episode": 11072, + "epoch": 0.06633833027764796, + "loss/policy_avg": 0.17208513617515564, + "lr": 9.558410020449898e-06, + "objective/entropy": -175.00662231445312, + "objective/kl": 33.992698669433594, + "objective/non_score_reward": -1.6996350288391113, + "objective/rlhf_reward": -5.4392902490839194, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 32.03794860839844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.716796875, + "step": 691, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989235401153564 + }, + { + "episode": 11088, + "epoch": 0.06643419491677752, + "loss/policy_avg": 0.01335047371685505, + "lr": 9.557770961145195e-06, + "objective/entropy": -248.65049743652344, + "objective/kl": 22.41885757446289, + "objective/non_score_reward": -1.1209429502487183, + "objective/rlhf_reward": -2.536360512452062, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.7352328300476074, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.673828125, + "step": 692, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0011818408966064 + }, + { + "episode": 11104, + "epoch": 0.06653005955590706, + "loss/policy_avg": 0.14417897164821625, + "lr": 9.557131901840492e-06, + "objective/entropy": -218.454345703125, + "objective/kl": 15.86509895324707, + "objective/non_score_reward": -0.7932549715042114, + "objective/rlhf_reward": 1.226980143785477, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.0328912734985352, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.640625, + "step": 693, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0003151893615723 + }, + { + "episode": 11120, + "epoch": 0.06662592419503661, + "loss/policy_avg": 0.09597369283437729, + "lr": 9.556492842535789e-06, + "objective/entropy": -175.68487548828125, + "objective/kl": 32.48929977416992, + "objective/non_score_reward": -1.624464988708496, + "objective/rlhf_reward": -2.0978601336479183, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.689056396484375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.705078125, + "step": 694, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998678207397461 + }, + { + "episode": 11136, + "epoch": 0.06672178883416616, + "loss/policy_avg": -0.004386359825730324, + "lr": 9.555853783231084e-06, + "objective/entropy": 122.54474639892578, + "objective/kl": 42.134315490722656, + "objective/non_score_reward": -2.106715679168701, + "objective/rlhf_reward": -6.822743091646748, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 6.307683944702148, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7421875, + "step": 695, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999419450759888 + }, + { + "episode": 11152, + "epoch": 0.06681765347329571, + "loss/policy_avg": 0.3615373373031616, + "lr": 9.555214723926381e-06, + "objective/entropy": -260.84075927734375, + "objective/kl": 35.725467681884766, + "objective/non_score_reward": -1.7862732410430908, + "objective/rlhf_reward": -5.664140108044505, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 45.438873291015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 696, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996116280555725 + }, + { + "episode": 11168, + "epoch": 0.06691351811242525, + "loss/policy_avg": 0.24602335691452026, + "lr": 9.554575664621678e-06, + "objective/entropy": -71.92741394042969, + "objective/kl": 30.083784103393555, + "objective/non_score_reward": -1.5041892528533936, + "objective/rlhf_reward": -4.6575073835596275, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 5.438946723937988, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4765625, + "step": 697, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998178482055664 + }, + { + "episode": 11184, + "epoch": 0.06700938275155481, + "loss/policy_avg": 0.034039177000522614, + "lr": 9.553936605316975e-06, + "objective/entropy": -198.67774963378906, + "objective/kl": 23.375925064086914, + "objective/non_score_reward": -1.1687963008880615, + "objective/rlhf_reward": -1.7514660700571265, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 0.5530495643615723, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.53125, + "step": 698, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00034236907959 + }, + { + "episode": 11200, + "epoch": 0.06710524739068435, + "loss/policy_avg": 0.5306535959243774, + "lr": 9.553297546012272e-06, + "objective/entropy": -143.43771362304688, + "objective/kl": 35.411888122558594, + "objective/non_score_reward": -1.7705943584442139, + "objective/rlhf_reward": -5.63177965125595, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 6.416120529174805, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66796875, + "step": 699, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994237422943115 + }, + { + "episode": 11216, + "epoch": 0.06720111202981391, + "loss/policy_avg": 0.2092888504266739, + "lr": 9.552658486707569e-06, + "objective/entropy": -169.036376953125, + "objective/kl": 30.64543914794922, + "objective/non_score_reward": -1.5322721004486084, + "objective/rlhf_reward": -1.7290880441665646, + "objective/scores": 1.1, + "policy/approxkl_avg": 132.6121063232422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.564453125, + "step": 700, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992992877960205 + }, + { + "episode": 11232, + "epoch": 0.06729697666894345, + "loss/policy_avg": 0.2553282380104065, + "lr": 9.552019427402864e-06, + "objective/entropy": -145.8370361328125, + "objective/kl": 31.58509063720703, + "objective/non_score_reward": -1.5792546272277832, + "objective/rlhf_reward": -4.760759084430292, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 23.342622756958008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.712890625, + "step": 701, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0007388591766357 + }, + { + "episode": 11248, + "epoch": 0.067392841308073, + "loss/policy_avg": 0.1272473782300949, + "lr": 9.55138036809816e-06, + "objective/entropy": -283.0919494628906, + "objective/kl": 18.825233459472656, + "objective/non_score_reward": -0.9412617683410645, + "objective/rlhf_reward": -2.4057970878824424, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.5947492122650146, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 702, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999638557434082 + }, + { + "episode": 11264, + "epoch": 0.06748870594720255, + "loss/policy_avg": 0.2034430205821991, + "lr": 9.550741308793456e-06, + "objective/entropy": -274.40478515625, + "objective/kl": 20.724695205688477, + "objective/non_score_reward": -1.0362348556518555, + "objective/rlhf_reward": -1.221220110298368, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 3.738941192626953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.61328125, + "step": 703, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997761607170105 + }, + { + "episode": 11280, + "epoch": 0.0675845705863321, + "loss/policy_avg": 0.7114033699035645, + "lr": 9.550102249488753e-06, + "objective/entropy": -135.6627960205078, + "objective/kl": 27.718311309814453, + "objective/non_score_reward": -1.3859155178070068, + "objective/rlhf_reward": -3.5962508422898605, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 32.94233703613281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 704, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9985225200653076 + }, + { + "episode": 11296, + "epoch": 0.06768043522546165, + "loss/policy_avg": -0.08856553584337234, + "lr": 9.54946319018405e-06, + "objective/entropy": -172.419921875, + "objective/kl": 31.078826904296875, + "objective/non_score_reward": -1.553941249847412, + "objective/rlhf_reward": -4.765167097659454, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 27.00151824951172, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.76171875, + "step": 705, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003095865249634 + }, + { + "episode": 11312, + "epoch": 0.0677762998645912, + "loss/policy_avg": -0.1016867533326149, + "lr": 9.548824130879346e-06, + "objective/entropy": -186.52476501464844, + "objective/kl": 30.371601104736328, + "objective/non_score_reward": -1.5185801982879639, + "objective/rlhf_reward": -4.593368175442576, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 7.805020332336426, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 706, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0018138885498047 + }, + { + "episode": 11328, + "epoch": 0.06787216450372074, + "loss/policy_avg": 0.3950710892677307, + "lr": 9.548185071574643e-06, + "objective/entropy": -169.30099487304688, + "objective/kl": 26.604206085205078, + "objective/non_score_reward": -1.3302103281021118, + "objective/rlhf_reward": -3.9422390843308985, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.9309802055358887, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.650390625, + "step": 707, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0003857612609863 + }, + { + "episode": 11344, + "epoch": 0.0679680291428503, + "loss/policy_avg": 0.15957045555114746, + "lr": 9.547546012269938e-06, + "objective/entropy": -152.48211669921875, + "objective/kl": 28.93355941772461, + "objective/non_score_reward": -1.4466780424118042, + "objective/rlhf_reward": -4.124852543295012, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 30.355663299560547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.728515625, + "step": 708, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9971683025360107 + }, + { + "episode": 11360, + "epoch": 0.06806389378197984, + "loss/policy_avg": 0.1635814905166626, + "lr": 9.546906952965235e-06, + "objective/entropy": -225.05284118652344, + "objective/kl": 32.07009506225586, + "objective/non_score_reward": -1.6035047769546509, + "objective/rlhf_reward": -5.088506314784212, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 25.63396453857422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.708984375, + "step": 709, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975805282592773 + }, + { + "episode": 11376, + "epoch": 0.0681597584211094, + "loss/policy_avg": 0.22918304800987244, + "lr": 9.546267893660532e-06, + "objective/entropy": -245.11099243164062, + "objective/kl": 31.21074867248535, + "objective/non_score_reward": -1.560537576675415, + "objective/rlhf_reward": -4.5802904419308765, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 14.6522216796875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.658203125, + "step": 710, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971773624420166 + }, + { + "episode": 11392, + "epoch": 0.06825562306023894, + "loss/policy_avg": -0.15267148613929749, + "lr": 9.545628834355829e-06, + "objective/entropy": -26.006134033203125, + "objective/kl": 25.76430320739746, + "objective/non_score_reward": -1.288215160369873, + "objective/rlhf_reward": -3.2054496509599044, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.9515511989593506, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.75, + "step": 711, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0021400451660156 + }, + { + "episode": 11408, + "epoch": 0.0683514876993685, + "loss/policy_avg": 0.03201477974653244, + "lr": 9.544989775051126e-06, + "objective/entropy": -229.9574737548828, + "objective/kl": 31.691633224487305, + "objective/non_score_reward": -1.5845816135406494, + "objective/rlhf_reward": -4.887728492827758, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 81.25225830078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.74609375, + "step": 712, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0012998580932617 + }, + { + "episode": 11424, + "epoch": 0.06844735233849804, + "loss/policy_avg": 0.5598920583724976, + "lr": 9.544350715746423e-06, + "objective/entropy": -198.39407348632812, + "objective/kl": 22.02547264099121, + "objective/non_score_reward": -1.1012736558914185, + "objective/rlhf_reward": -3.045844846700115, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 7.494403839111328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6640625, + "step": 713, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001235008239746 + }, + { + "episode": 11440, + "epoch": 0.0685432169776276, + "loss/policy_avg": 0.14270013570785522, + "lr": 9.543711656441718e-06, + "objective/entropy": -281.67730712890625, + "objective/kl": 30.167518615722656, + "objective/non_score_reward": -1.5083760023117065, + "objective/rlhf_reward": -4.517732465060886, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 42.272212982177734, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6171875, + "step": 714, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981521368026733 + }, + { + "episode": 11456, + "epoch": 0.06863908161675714, + "loss/policy_avg": 0.23854002356529236, + "lr": 9.543072597137015e-06, + "objective/entropy": -205.70501708984375, + "objective/kl": 26.037616729736328, + "objective/non_score_reward": -1.3018807172775269, + "objective/rlhf_reward": -3.603402886454182, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 21.1671085357666, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736328125, + "step": 715, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999593734741211 + }, + { + "episode": 11472, + "epoch": 0.06873494625588669, + "loss/policy_avg": 0.25810641050338745, + "lr": 9.542433537832312e-06, + "objective/entropy": -202.4583740234375, + "objective/kl": 26.777297973632812, + "objective/non_score_reward": -1.338865041732788, + "objective/rlhf_reward": -3.7513400054612926, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 4.448478698730469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7890625, + "step": 716, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999415636062622 + }, + { + "episode": 11488, + "epoch": 0.06883081089501623, + "loss/policy_avg": 0.16866181790828705, + "lr": 9.541794478527609e-06, + "objective/entropy": -174.37855529785156, + "objective/kl": 34.941444396972656, + "objective/non_score_reward": -1.7470722198486328, + "objective/rlhf_reward": -5.43202957412298, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 0.9149700403213501, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 717, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000814437866211 + }, + { + "episode": 11504, + "epoch": 0.06892667553414579, + "loss/policy_avg": 0.20718123018741608, + "lr": 9.541155419222906e-06, + "objective/entropy": -75.93595123291016, + "objective/kl": 37.52787780761719, + "objective/non_score_reward": -1.8763937950134277, + "objective/rlhf_reward": -6.024622860367655, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 3.859286308288574, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 718, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990894794464111 + }, + { + "episode": 11520, + "epoch": 0.06902254017327533, + "loss/policy_avg": -0.14078834652900696, + "lr": 9.5405163599182e-06, + "objective/entropy": -111.06301879882812, + "objective/kl": 37.833980560302734, + "objective/non_score_reward": -1.8916990756988525, + "objective/rlhf_reward": -5.44408971287397, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.0138969421386719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.646484375, + "step": 719, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016212463378906 + }, + { + "episode": 11536, + "epoch": 0.06911840481240489, + "loss/policy_avg": -0.02326921373605728, + "lr": 9.539877300613498e-06, + "objective/entropy": -7.474525451660156, + "objective/kl": 37.21611785888672, + "objective/non_score_reward": -1.860805869102478, + "objective/rlhf_reward": -7.443223357200623, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.989769458770752, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5546875, + "step": 720, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0015361309051514 + }, + { + "episode": 11552, + "epoch": 0.06921426945153443, + "loss/policy_avg": 0.9960123896598816, + "lr": 9.539238241308795e-06, + "objective/entropy": -102.21640014648438, + "objective/kl": 29.624881744384766, + "objective/non_score_reward": -1.4812440872192383, + "objective/rlhf_reward": -3.9775650007294967, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.5700416564941406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.568359375, + "step": 721, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000302314758301 + }, + { + "episode": 11568, + "epoch": 0.06931013409066399, + "loss/policy_avg": -0.022494332864880562, + "lr": 9.538599182004091e-06, + "objective/entropy": -97.00556182861328, + "objective/kl": 34.23220443725586, + "objective/non_score_reward": -1.7116100788116455, + "objective/rlhf_reward": -5.520927820235414, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 6.0028605461120605, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.625, + "step": 722, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0014429092407227 + }, + { + "episode": 11584, + "epoch": 0.06940599872979353, + "loss/policy_avg": 0.2970792055130005, + "lr": 9.537960122699387e-06, + "objective/entropy": -218.43130493164062, + "objective/kl": 23.677339553833008, + "objective/non_score_reward": -1.1838669776916504, + "objective/rlhf_reward": -0.335467970371246, + "objective/scores": 1.1, + "policy/approxkl_avg": 35.85502624511719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.744140625, + "step": 723, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9957175254821777 + }, + { + "episode": 11600, + "epoch": 0.06950186336892308, + "loss/policy_avg": 0.09062906354665756, + "lr": 9.537321063394683e-06, + "objective/entropy": -145.62179565429688, + "objective/kl": 19.510597229003906, + "objective/non_score_reward": -0.9755299091339111, + "objective/rlhf_reward": -2.560484102278381, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 5.657525539398193, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.671875, + "step": 724, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002211809158325 + }, + { + "episode": 11616, + "epoch": 0.06959772800805263, + "loss/policy_avg": 0.5650205612182617, + "lr": 9.53668200408998e-06, + "objective/entropy": -189.58197021484375, + "objective/kl": 22.43151092529297, + "objective/non_score_reward": -1.1215755939483643, + "objective/rlhf_reward": -3.1446669011408384, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 16.189781188964844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69140625, + "step": 725, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978678226470947 + }, + { + "episode": 11632, + "epoch": 0.06969359264718218, + "loss/policy_avg": 0.10538655519485474, + "lr": 9.536042944785277e-06, + "objective/entropy": -262.17254638671875, + "objective/kl": 21.21435546875, + "objective/non_score_reward": -1.0607177019119263, + "objective/rlhf_reward": -2.1201648137727123, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 5.554556846618652, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7578125, + "step": 726, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9994568824768066 + }, + { + "episode": 11648, + "epoch": 0.06978945728631172, + "loss/policy_avg": 0.08264347910881042, + "lr": 9.535403885480572e-06, + "objective/entropy": -144.35389709472656, + "objective/kl": 23.849288940429688, + "objective/non_score_reward": -1.1924644708633423, + "objective/rlhf_reward": -3.2889052657440896, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 0.2577582895755768, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.60546875, + "step": 727, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009419918060303 + }, + { + "episode": 11664, + "epoch": 0.06988532192544128, + "loss/policy_avg": -0.11442309617996216, + "lr": 9.53476482617587e-06, + "objective/entropy": -161.91555786132812, + "objective/kl": 29.32978057861328, + "objective/non_score_reward": -1.4664889574050903, + "objective/rlhf_reward": -4.132622496287028, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 6.162350654602051, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 728, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0256590843200684 + }, + { + "episode": 11680, + "epoch": 0.06998118656457082, + "loss/policy_avg": 0.15979725122451782, + "lr": 9.534125766871166e-06, + "objective/entropy": -46.392860412597656, + "objective/kl": 34.71672058105469, + "objective/non_score_reward": -1.7358360290527344, + "objective/rlhf_reward": -5.601708403139739, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 26.208736419677734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.837890625, + "step": 729, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9947504997253418 + }, + { + "episode": 11696, + "epoch": 0.07007705120370038, + "loss/policy_avg": 0.01945001818239689, + "lr": 9.533486707566463e-06, + "objective/entropy": -199.32308959960938, + "objective/kl": 20.052722930908203, + "objective/non_score_reward": -1.002636194229126, + "objective/rlhf_reward": -2.586712677677242, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 5.049467086791992, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.623046875, + "step": 730, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999655485153198 + }, + { + "episode": 11712, + "epoch": 0.07017291584282992, + "loss/policy_avg": 0.22911685705184937, + "lr": 9.53284764826176e-06, + "objective/entropy": -199.43820190429688, + "objective/kl": 29.375852584838867, + "objective/non_score_reward": -1.4687926769256592, + "objective/rlhf_reward": -3.4751707077026364, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.4132235050201416, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.64453125, + "step": 731, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989359378814697 + }, + { + "episode": 11728, + "epoch": 0.07026878048195948, + "loss/policy_avg": 0.045667171478271484, + "lr": 9.532208588957055e-06, + "objective/entropy": -156.77005004882812, + "objective/kl": 28.574951171875, + "objective/non_score_reward": -1.4287474155426025, + "objective/rlhf_reward": -4.110870037142353, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 11.299884796142578, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.587890625, + "step": 732, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989423751831055 + }, + { + "episode": 11744, + "epoch": 0.07036464512108902, + "loss/policy_avg": -0.07621235400438309, + "lr": 9.531569529652352e-06, + "objective/entropy": -211.5927734375, + "objective/kl": 25.139881134033203, + "objective/non_score_reward": -1.2569940090179443, + "objective/rlhf_reward": -3.2031475856629124, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.0796079635620117, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.548828125, + "step": 733, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00140118598938 + }, + { + "episode": 11760, + "epoch": 0.07046050976021857, + "loss/policy_avg": 0.3665542006492615, + "lr": 9.530930470347649e-06, + "objective/entropy": -136.42066955566406, + "objective/kl": 28.39642333984375, + "objective/non_score_reward": -1.4198211431503296, + "objective/rlhf_reward": -5.679284453392029, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8006393909454346, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.673828125, + "step": 734, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988253116607666 + }, + { + "episode": 11776, + "epoch": 0.07055637439934812, + "loss/policy_avg": -0.16624964773654938, + "lr": 9.530291411042946e-06, + "objective/entropy": -172.16896057128906, + "objective/kl": 32.62467956542969, + "objective/non_score_reward": -1.6312339305877686, + "objective/rlhf_reward": -5.183300068884521, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 8.176142692565918, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 735, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0014700889587402 + }, + { + "episode": 11792, + "epoch": 0.07065223903847767, + "loss/policy_avg": -0.01751716434955597, + "lr": 9.529652351738243e-06, + "objective/entropy": -244.469970703125, + "objective/kl": 21.34896469116211, + "objective/non_score_reward": -1.0674481391906738, + "objective/rlhf_reward": -1.346073900104734, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.2310829162597656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.73828125, + "step": 736, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0015974044799805 + }, + { + "episode": 11808, + "epoch": 0.07074810367760721, + "loss/policy_avg": -0.13727766275405884, + "lr": 9.52901329243354e-06, + "objective/entropy": -152.7752227783203, + "objective/kl": 30.841548919677734, + "objective/non_score_reward": -1.5420774221420288, + "objective/rlhf_reward": -1.7683096885681149, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.1432337760925293, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.732421875, + "step": 737, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000338315963745 + }, + { + "episode": 11824, + "epoch": 0.07084396831673677, + "loss/policy_avg": 0.24724145233631134, + "lr": 9.528374233128835e-06, + "objective/entropy": -249.35003662109375, + "objective/kl": 41.97819519042969, + "objective/non_score_reward": -2.098909854888916, + "objective/rlhf_reward": -6.945041160197601, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 14.357757568359375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7421875, + "step": 738, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984996318817139 + }, + { + "episode": 11840, + "epoch": 0.07093983295586631, + "loss/policy_avg": -0.1166142150759697, + "lr": 9.527735173824132e-06, + "objective/entropy": 16.65149688720703, + "objective/kl": 28.71587371826172, + "objective/non_score_reward": -1.4357936382293701, + "objective/rlhf_reward": -4.401539257078796, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 3.7607579231262207, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.64453125, + "step": 739, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990723133087158 + }, + { + "episode": 11856, + "epoch": 0.07103569759499587, + "loss/policy_avg": 0.035362888127565384, + "lr": 9.527096114519428e-06, + "objective/entropy": -227.2210235595703, + "objective/kl": 27.349641799926758, + "objective/non_score_reward": -1.36748206615448, + "objective/rlhf_reward": -3.865808401171284, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 9.06348705291748, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6640625, + "step": 740, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9996118545532227 + }, + { + "episode": 11872, + "epoch": 0.07113156223412541, + "loss/policy_avg": 0.31989267468452454, + "lr": 9.526457055214725e-06, + "objective/entropy": -213.7845458984375, + "objective/kl": 34.27381896972656, + "objective/non_score_reward": -1.713691234588623, + "objective/rlhf_reward": -4.732058527246986, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 26.892040252685547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.67578125, + "step": 741, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0020368099212646 + }, + { + "episode": 11888, + "epoch": 0.07122742687325496, + "loss/policy_avg": 0.18080441653728485, + "lr": 9.525817995910022e-06, + "objective/entropy": -164.34909057617188, + "objective/kl": 29.15081024169922, + "objective/non_score_reward": -1.457540512084961, + "objective/rlhf_reward": -4.379564206214294, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 19.9893798828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 742, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979908466339111 + }, + { + "episode": 11904, + "epoch": 0.0713232915123845, + "loss/policy_avg": 0.06947439908981323, + "lr": 9.525178936605317e-06, + "objective/entropy": -35.78013610839844, + "objective/kl": 30.88395118713379, + "objective/non_score_reward": -1.5441975593566895, + "objective/rlhf_reward": -4.620531051364496, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 9.903773307800293, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.658203125, + "step": 743, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000925064086914 + }, + { + "episode": 11920, + "epoch": 0.07141915615151406, + "loss/policy_avg": 0.4868197441101074, + "lr": 9.524539877300614e-06, + "objective/entropy": -185.67857360839844, + "objective/kl": 30.794139862060547, + "objective/non_score_reward": -1.5397069454193115, + "objective/rlhf_reward": -4.833315048247499, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 23.752399444580078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5703125, + "step": 744, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9970831871032715 + }, + { + "episode": 11936, + "epoch": 0.0715150207906436, + "loss/policy_avg": 0.4937871992588043, + "lr": 9.52390081799591e-06, + "objective/entropy": -196.15248107910156, + "objective/kl": 32.130393981933594, + "objective/non_score_reward": -1.6065199375152588, + "objective/rlhf_reward": -5.084443858175903, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.993836402893066, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.509765625, + "step": 745, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994457960128784 + }, + { + "episode": 11952, + "epoch": 0.07161088542977316, + "loss/policy_avg": 0.10673123598098755, + "lr": 9.523261758691206e-06, + "objective/entropy": -74.68463134765625, + "objective/kl": 34.281944274902344, + "objective/non_score_reward": -1.7140971422195435, + "objective/rlhf_reward": -3.9326697334062786, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.657389640808105, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4892578125, + "step": 746, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998197078704834 + }, + { + "episode": 11968, + "epoch": 0.0717067500689027, + "loss/policy_avg": 0.08303539454936981, + "lr": 9.522622699386503e-06, + "objective/entropy": -234.022705078125, + "objective/kl": 26.956684112548828, + "objective/non_score_reward": -1.3478342294692993, + "objective/rlhf_reward": -3.26863074518827, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 9.614282608032227, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.77734375, + "step": 747, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9977872371673584 + }, + { + "episode": 11984, + "epoch": 0.07180261470803226, + "loss/policy_avg": 0.006275704130530357, + "lr": 9.5219836400818e-06, + "objective/entropy": -179.78111267089844, + "objective/kl": 24.191059112548828, + "objective/non_score_reward": -1.2095528841018677, + "objective/rlhf_reward": -3.4789615509256553, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 3.5060572624206543, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48828125, + "step": 748, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001844882965088 + }, + { + "episode": 12000, + "epoch": 0.07189847934716181, + "loss/policy_avg": 0.05262988060712814, + "lr": 9.521344580777097e-06, + "objective/entropy": -61.52648162841797, + "objective/kl": 24.345882415771484, + "objective/non_score_reward": -1.2172942161560059, + "objective/rlhf_reward": -2.7464705727258067, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 26.343456268310547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.734375, + "step": 749, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996674060821533 + }, + { + "episode": 12016, + "epoch": 0.07199434398629136, + "loss/policy_avg": 0.1489763706922531, + "lr": 9.520705521472394e-06, + "objective/entropy": -179.14523315429688, + "objective/kl": 25.692440032958984, + "objective/non_score_reward": -1.284622073173523, + "objective/rlhf_reward": -3.19107700415128, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.4589556455612183, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.517578125, + "step": 750, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989516735076904 + }, + { + "episode": 12032, + "epoch": 0.07209020862542091, + "loss/policy_avg": 0.06708867847919464, + "lr": 9.520066462167689e-06, + "objective/entropy": -56.47541427612305, + "objective/kl": 42.95630645751953, + "objective/non_score_reward": -2.147815227508545, + "objective/rlhf_reward": -6.7664322808113795, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 6.856327056884766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.650390625, + "step": 751, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9969167709350586 + }, + { + "episode": 12048, + "epoch": 0.07218607326455045, + "loss/policy_avg": 0.3973958194255829, + "lr": 9.519427402862986e-06, + "objective/entropy": -244.11431884765625, + "objective/kl": 25.62933921813965, + "objective/non_score_reward": -1.2814669609069824, + "objective/rlhf_reward": -3.301039035591196, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 48.01885223388672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.671875, + "step": 752, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983410835266113 + }, + { + "episode": 12064, + "epoch": 0.07228193790368001, + "loss/policy_avg": 0.016892850399017334, + "lr": 9.518788343558283e-06, + "objective/entropy": -233.80613708496094, + "objective/kl": 33.0050048828125, + "objective/non_score_reward": -1.6502504348754883, + "objective/rlhf_reward": -4.653590510563786, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 15.416328430175781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.560546875, + "step": 753, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999025821685791 + }, + { + "episode": 12080, + "epoch": 0.07237780254280955, + "loss/policy_avg": 0.10087546706199646, + "lr": 9.51814928425358e-06, + "objective/entropy": -283.5254211425781, + "objective/kl": 25.051952362060547, + "objective/non_score_reward": -1.2525975704193115, + "objective/rlhf_reward": -2.6103905797004696, + "objective/scores": 0.6, + "policy/approxkl_avg": 19.29462432861328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 754, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9981729984283447 + }, + { + "episode": 12096, + "epoch": 0.07247366718193911, + "loss/policy_avg": 0.24108710885047913, + "lr": 9.517510224948877e-06, + "objective/entropy": -211.13575744628906, + "objective/kl": 35.66078186035156, + "objective/non_score_reward": -1.7830390930175781, + "objective/rlhf_reward": -5.708324392040339, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 11.15980339050293, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.60546875, + "step": 755, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9982430934906006 + }, + { + "episode": 12112, + "epoch": 0.07256953182106865, + "loss/policy_avg": 0.6718421578407288, + "lr": 9.516871165644172e-06, + "objective/entropy": -148.00872802734375, + "objective/kl": 30.348403930664062, + "objective/non_score_reward": -1.5174202919006348, + "objective/rlhf_reward": -4.669681048393249, + "objective/scores": 0.35, + "policy/approxkl_avg": 24.264657974243164, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 756, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989105463027954 + }, + { + "episode": 12128, + "epoch": 0.0726653964601982, + "loss/policy_avg": 0.17684796452522278, + "lr": 9.516232106339469e-06, + "objective/entropy": -220.75283813476562, + "objective/kl": 18.81310272216797, + "objective/non_score_reward": -0.9406551122665405, + "objective/rlhf_reward": -2.3840183998025477, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.753880500793457, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.498046875, + "step": 757, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982280731201172 + }, + { + "episode": 12144, + "epoch": 0.07276126109932775, + "loss/policy_avg": 0.5594636797904968, + "lr": 9.515593047034765e-06, + "objective/entropy": -182.7705535888672, + "objective/kl": 19.829849243164062, + "objective/non_score_reward": -0.991492509841919, + "objective/rlhf_reward": 0.4340301394462589, + "objective/scores": 1.1, + "policy/approxkl_avg": 28.46674346923828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.572265625, + "step": 758, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991915225982666 + }, + { + "episode": 12160, + "epoch": 0.0728571257384573, + "loss/policy_avg": 0.6502060890197754, + "lr": 9.514953987730062e-06, + "objective/entropy": -112.33629608154297, + "objective/kl": 39.52580642700195, + "objective/non_score_reward": -1.9762903451919556, + "objective/rlhf_reward": -5.78245514847425, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 4.3783769607543945, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 759, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975318908691406 + }, + { + "episode": 12176, + "epoch": 0.07295299037758685, + "loss/policy_avg": 0.88495934009552, + "lr": 9.51431492842536e-06, + "objective/entropy": -201.14666748046875, + "objective/kl": 27.90923309326172, + "objective/non_score_reward": -1.3954615592956543, + "objective/rlhf_reward": -4.240210583716064, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 3.1258697509765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 760, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0020923614501953 + }, + { + "episode": 12192, + "epoch": 0.0730488550167164, + "loss/policy_avg": 0.3271714448928833, + "lr": 9.513675869120656e-06, + "objective/entropy": -236.55361938476562, + "objective/kl": 28.77971839904785, + "objective/non_score_reward": -1.43898606300354, + "objective/rlhf_reward": -4.35594413280487, + "objective/scores": 0.35, + "policy/approxkl_avg": 5.469420909881592, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6171875, + "step": 761, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997791051864624 + }, + { + "episode": 12208, + "epoch": 0.07314471965584594, + "loss/policy_avg": 0.0032866448163986206, + "lr": 9.513036809815951e-06, + "objective/entropy": -200.22227478027344, + "objective/kl": 28.73204803466797, + "objective/non_score_reward": -1.4366023540496826, + "objective/rlhf_reward": -4.142289552752095, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 0.5752939581871033, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.587890625, + "step": 762, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0023627281188965 + }, + { + "episode": 12224, + "epoch": 0.0732405842949755, + "loss/policy_avg": 0.21868771314620972, + "lr": 9.512397750511248e-06, + "objective/entropy": -187.9447784423828, + "objective/kl": 20.44854736328125, + "objective/non_score_reward": -1.0224274396896362, + "objective/rlhf_reward": -2.5739379761540255, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 11.789055824279785, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 763, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9979248046875 + }, + { + "episode": 12240, + "epoch": 0.07333644893410504, + "loss/policy_avg": 0.3879333734512329, + "lr": 9.511758691206545e-06, + "objective/entropy": -267.96685791015625, + "objective/kl": 28.91057586669922, + "objective/non_score_reward": -1.4455287456512451, + "objective/rlhf_reward": -3.8347037536668136, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.813044548034668, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6953125, + "step": 764, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0009827613830566 + }, + { + "episode": 12256, + "epoch": 0.0734323135732346, + "loss/policy_avg": 0.06569409370422363, + "lr": 9.511119631901842e-06, + "objective/entropy": -207.83352661132812, + "objective/kl": 24.208805084228516, + "objective/non_score_reward": -1.2104402780532837, + "objective/rlhf_reward": -3.2855019261508733, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.039762258529663, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.658203125, + "step": 765, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0008108615875244 + }, + { + "episode": 12272, + "epoch": 0.07352817821236414, + "loss/policy_avg": 0.9109029769897461, + "lr": 9.510480572597139e-06, + "objective/entropy": -85.82101440429688, + "objective/kl": 31.18517303466797, + "objective/non_score_reward": -1.5592585802078247, + "objective/rlhf_reward": -4.50370092789332, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.987689018249512, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.56640625, + "step": 766, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997962236404419 + }, + { + "episode": 12288, + "epoch": 0.0736240428514937, + "loss/policy_avg": 0.44006603956222534, + "lr": 9.509841513292434e-06, + "objective/entropy": -254.5596923828125, + "objective/kl": 26.123559951782227, + "objective/non_score_reward": -1.3061779737472534, + "objective/rlhf_reward": -3.6684524109035284, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 13.005337715148926, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.552734375, + "step": 767, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978489875793457 + }, + { + "episode": 12304, + "epoch": 0.07371990749062324, + "loss/policy_avg": 0.14191022515296936, + "lr": 9.509202453987731e-06, + "objective/entropy": -185.1569061279297, + "objective/kl": 38.093666076660156, + "objective/non_score_reward": -1.9046835899353027, + "objective/rlhf_reward": -7.618734002113342, + "objective/scores": 0.0, + "policy/approxkl_avg": 60.80290603637695, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 768, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9955925941467285 + }, + { + "episode": 12320, + "epoch": 0.0738157721297528, + "loss/policy_avg": -0.31537145376205444, + "lr": 9.508563394683026e-06, + "objective/entropy": -164.9215087890625, + "objective/kl": 30.594449996948242, + "objective/non_score_reward": -1.5297224521636963, + "objective/rlhf_reward": -4.63793725055015, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 1.2754226922988892, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6171875, + "step": 769, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001909017562866 + }, + { + "episode": 12336, + "epoch": 0.07391163676888234, + "loss/policy_avg": 0.034731436520814896, + "lr": 9.507924335378323e-06, + "objective/entropy": -200.43959045410156, + "objective/kl": 36.4830436706543, + "objective/non_score_reward": -1.8241522312164307, + "objective/rlhf_reward": -5.8727765872078805, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 3.3153905868530273, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.626953125, + "step": 770, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0031118392944336 + }, + { + "episode": 12352, + "epoch": 0.07400750140801189, + "loss/policy_avg": 0.29965466260910034, + "lr": 9.50728527607362e-06, + "objective/entropy": -168.58261108398438, + "objective/kl": 34.881736755371094, + "objective/non_score_reward": -1.7440869808197021, + "objective/rlhf_reward": -5.314488296926605, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 12.419918060302734, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.744140625, + "step": 771, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999511957168579 + }, + { + "episode": 12368, + "epoch": 0.07410336604714143, + "loss/policy_avg": 0.5840628743171692, + "lr": 9.506646216768917e-06, + "objective/entropy": -149.50210571289062, + "objective/kl": 26.40768051147461, + "objective/non_score_reward": -1.3203840255737305, + "objective/rlhf_reward": -3.8005837230042214, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 10.453241348266602, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 772, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990813732147217 + }, + { + "episode": 12384, + "epoch": 0.07419923068627099, + "loss/policy_avg": -0.20146791636943817, + "lr": 9.506007157464214e-06, + "objective/entropy": -206.66688537597656, + "objective/kl": 25.146541595458984, + "objective/non_score_reward": -1.2573271989822388, + "objective/rlhf_reward": -5.029308795928955, + "objective/scores": 0.0, + "policy/approxkl_avg": 55.61228561401367, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.728515625, + "step": 773, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993964433670044 + }, + { + "episode": 12400, + "epoch": 0.07429509532540053, + "loss/policy_avg": 2.0998456478118896, + "lr": 9.50536809815951e-06, + "objective/entropy": -135.09249877929688, + "objective/kl": 26.86371612548828, + "objective/non_score_reward": -1.3431859016418457, + "objective/rlhf_reward": -3.922145526023254, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 7.190234184265137, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.548828125, + "step": 774, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000576972961426 + }, + { + "episode": 12416, + "epoch": 0.07439095996453009, + "loss/policy_avg": 0.024284163489937782, + "lr": 9.504729038854806e-06, + "objective/entropy": -269.6484375, + "objective/kl": 21.226428985595703, + "objective/non_score_reward": -1.061321496963501, + "objective/rlhf_reward": -2.7946879669145197, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 6.07242488861084, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.66796875, + "step": 775, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999997615814209 + }, + { + "episode": 12432, + "epoch": 0.07448682460365963, + "loss/policy_avg": 0.03317616134881973, + "lr": 9.504089979550103e-06, + "objective/entropy": -234.43389892578125, + "objective/kl": 27.79866600036621, + "objective/non_score_reward": -1.3899333477020264, + "objective/rlhf_reward": -3.8264001766840616, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.3638486862182617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689453125, + "step": 776, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996023178100586 + }, + { + "episode": 12448, + "epoch": 0.07458268924278919, + "loss/policy_avg": 0.16213266551494598, + "lr": 9.5034509202454e-06, + "objective/entropy": -203.708740234375, + "objective/kl": 38.612911224365234, + "objective/non_score_reward": -1.9306457042694092, + "objective/rlhf_reward": -6.271984438510284, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 1.698218584060669, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.64453125, + "step": 777, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995825290679932 + }, + { + "episode": 12464, + "epoch": 0.07467855388191873, + "loss/policy_avg": 0.2597602605819702, + "lr": 9.502811860940696e-06, + "objective/entropy": -250.4356231689453, + "objective/kl": 30.581310272216797, + "objective/non_score_reward": -1.529065489768982, + "objective/rlhf_reward": -4.737660029021603, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 8.781853675842285, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.615234375, + "step": 778, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989944696426392 + }, + { + "episode": 12480, + "epoch": 0.07477441852104828, + "loss/policy_avg": -0.24061758816242218, + "lr": 9.502172801635993e-06, + "objective/entropy": -98.61205291748047, + "objective/kl": 26.375612258911133, + "objective/non_score_reward": -1.3187806606292725, + "objective/rlhf_reward": -3.794169786389231, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 5.955351829528809, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.7265625, + "step": 779, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.003558397293091 + }, + { + "episode": 12496, + "epoch": 0.07487028316017783, + "loss/policy_avg": 0.48288995027542114, + "lr": 9.50153374233129e-06, + "objective/entropy": -230.7918701171875, + "objective/kl": 37.52941131591797, + "objective/non_score_reward": -1.8764704465866089, + "objective/rlhf_reward": -6.024929526265025, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 12.408464431762695, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.68359375, + "step": 780, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990174770355225 + }, + { + "episode": 12512, + "epoch": 0.07496614779930738, + "loss/policy_avg": 0.27871203422546387, + "lr": 9.500894683026585e-06, + "objective/entropy": -159.85903930664062, + "objective/kl": 25.038909912109375, + "objective/non_score_reward": -1.2519454956054688, + "objective/rlhf_reward": -2.607781863212585, + "objective/scores": 0.6, + "policy/approxkl_avg": 46.26438903808594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 781, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000070095062256 + }, + { + "episode": 12528, + "epoch": 0.07506201243843692, + "loss/policy_avg": 0.06291055679321289, + "lr": 9.500255623721882e-06, + "objective/entropy": -163.0406494140625, + "objective/kl": 27.101749420166016, + "objective/non_score_reward": -1.3550875186920166, + "objective/rlhf_reward": -4.061099970076961, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 13.61475658416748, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 782, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986028671264648 + }, + { + "episode": 12544, + "epoch": 0.07515787707756648, + "loss/policy_avg": 0.07766500115394592, + "lr": 9.499616564417179e-06, + "objective/entropy": -264.68377685546875, + "objective/kl": 26.38882827758789, + "objective/non_score_reward": -1.319441556930542, + "objective/rlhf_reward": -2.3540468558084697, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 29.816272735595703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 783, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993165731430054 + }, + { + "episode": 12560, + "epoch": 0.07525374171669602, + "loss/policy_avg": -0.25779616832733154, + "lr": 9.498977505112476e-06, + "objective/entropy": -192.4373016357422, + "objective/kl": 30.569807052612305, + "objective/non_score_reward": -1.528490424156189, + "objective/rlhf_reward": -4.5098417139688305, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 3.409776210784912, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.765625, + "step": 784, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0031237602233887 + }, + { + "episode": 12576, + "epoch": 0.07534960635582558, + "loss/policy_avg": -0.23182180523872375, + "lr": 9.498338445807773e-06, + "objective/entropy": -116.57367706298828, + "objective/kl": 30.319534301757812, + "objective/non_score_reward": -1.5159766674041748, + "objective/rlhf_reward": -4.704657160972042, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 3.2308108806610107, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.3955078125, + "step": 785, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001164436340332 + }, + { + "episode": 12592, + "epoch": 0.07544547099495512, + "loss/policy_avg": 0.270114541053772, + "lr": 9.497699386503068e-06, + "objective/entropy": -213.6279296875, + "objective/kl": 34.02395248413086, + "objective/non_score_reward": -1.701197624206543, + "objective/rlhf_reward": -3.8810713633310527, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 4.159467697143555, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.556640625, + "step": 786, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999849557876587 + }, + { + "episode": 12608, + "epoch": 0.07554133563408467, + "loss/policy_avg": 0.01593317836523056, + "lr": 9.497060327198365e-06, + "objective/entropy": -83.6307601928711, + "objective/kl": 28.397233963012695, + "objective/non_score_reward": -1.4198617935180664, + "objective/rlhf_reward": -4.198494317944407, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 13.974614143371582, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.576171875, + "step": 787, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9974963665008545 + }, + { + "episode": 12624, + "epoch": 0.07563720027321422, + "loss/policy_avg": 0.122782863676548, + "lr": 9.496421267893662e-06, + "objective/entropy": -66.27203369140625, + "objective/kl": 20.0443115234375, + "objective/non_score_reward": -1.0022156238555908, + "objective/rlhf_reward": -2.6302602673448146, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 5.128955364227295, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.78125, + "step": 788, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.002767562866211 + }, + { + "episode": 12640, + "epoch": 0.07573306491234377, + "loss/policy_avg": 0.06789802759885788, + "lr": 9.495782208588959e-06, + "objective/entropy": -174.1296844482422, + "objective/kl": 28.25243377685547, + "objective/non_score_reward": -1.4126217365264893, + "objective/rlhf_reward": -3.988627438963042, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 21.132152557373047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.802734375, + "step": 789, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0010154247283936 + }, + { + "episode": 12656, + "epoch": 0.07582892955147331, + "loss/policy_avg": 0.1666814684867859, + "lr": 9.495143149284254e-06, + "objective/entropy": -226.70257568359375, + "objective/kl": 28.976097106933594, + "objective/non_score_reward": -1.4488048553466797, + "objective/rlhf_reward": -4.371387500961391, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 2.0613138675689697, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62890625, + "step": 790, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994125366210938 + }, + { + "episode": 12672, + "epoch": 0.07592479419060287, + "loss/policy_avg": 0.1284073442220688, + "lr": 9.49450408997955e-06, + "objective/entropy": -215.84002685546875, + "objective/kl": 28.486852645874023, + "objective/non_score_reward": -1.4243427515029907, + "objective/rlhf_reward": -4.355735114126831, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 5.659012317657471, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 791, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002176761627197 + }, + { + "episode": 12688, + "epoch": 0.07602065882973241, + "loss/policy_avg": -0.04723303020000458, + "lr": 9.493865030674848e-06, + "objective/entropy": -227.61280822753906, + "objective/kl": 28.772476196289062, + "objective/non_score_reward": -1.4386236667633057, + "objective/rlhf_reward": -2.830775891185972, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.8349313735961914, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.609375, + "step": 792, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.002286911010742 + }, + { + "episode": 12704, + "epoch": 0.07611652346886197, + "loss/policy_avg": -0.01974731869995594, + "lr": 9.493225971370144e-06, + "objective/entropy": -168.45291137695312, + "objective/kl": 32.674957275390625, + "objective/non_score_reward": -1.633747935295105, + "objective/rlhf_reward": -5.209478828936739, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 0.8098639249801636, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 793, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0004916191101074 + }, + { + "episode": 12720, + "epoch": 0.07621238810799151, + "loss/policy_avg": 0.3524478077888489, + "lr": 9.49258691206544e-06, + "objective/entropy": -170.04669189453125, + "objective/kl": 35.1775016784668, + "objective/non_score_reward": -1.7588751316070557, + "objective/rlhf_reward": -5.479241101947382, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 4.70783805847168, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.658203125, + "step": 794, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981024265289307 + }, + { + "episode": 12736, + "epoch": 0.07630825274712107, + "loss/policy_avg": 0.14937232434749603, + "lr": 9.491947852760736e-06, + "objective/entropy": -258.00518798828125, + "objective/kl": 30.382396697998047, + "objective/non_score_reward": -1.5191197395324707, + "objective/rlhf_reward": -4.472358975473957, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 8.522323608398438, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.68359375, + "step": 795, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.00205659866333 + }, + { + "episode": 12752, + "epoch": 0.07640411738625061, + "loss/policy_avg": 0.4101511240005493, + "lr": 9.491308793456033e-06, + "objective/entropy": -97.3719482421875, + "objective/kl": 49.89447021484375, + "objective/non_score_reward": -2.4947237968444824, + "objective/rlhf_reward": -7.578894591331482, + "objective/scores": 0.6, + "policy/approxkl_avg": 19.377134323120117, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.498046875, + "step": 796, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981412887573242 + }, + { + "episode": 12768, + "epoch": 0.07649998202538016, + "loss/policy_avg": -0.0627971962094307, + "lr": 9.49066973415133e-06, + "objective/entropy": -110.8655776977539, + "objective/kl": 44.73468017578125, + "objective/non_score_reward": -2.23673415184021, + "objective/rlhf_reward": -6.546936726570129, + "objective/scores": 0.6, + "policy/approxkl_avg": 5.804272651672363, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.806640625, + "step": 797, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971017837524414 + }, + { + "episode": 12784, + "epoch": 0.0765958466645097, + "loss/policy_avg": 0.3731452226638794, + "lr": 9.490030674846627e-06, + "objective/entropy": -15.07757568359375, + "objective/kl": 24.15683364868164, + "objective/non_score_reward": -1.2078416347503662, + "objective/rlhf_reward": -2.4313664793968197, + "objective/scores": 0.6, + "policy/approxkl_avg": 5.745340347290039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.638671875, + "step": 798, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993362426757812 + }, + { + "episode": 12800, + "epoch": 0.07669171130363926, + "loss/policy_avg": 0.3336324691772461, + "lr": 9.489391615541922e-06, + "objective/entropy": -249.59414672851562, + "objective/kl": 28.68617820739746, + "objective/non_score_reward": -1.4343090057373047, + "objective/rlhf_reward": -2.8135166510355205, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 3.9479708671569824, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.693359375, + "step": 799, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993374347686768 + }, + { + "episode": 12816, + "epoch": 0.0767875759427688, + "loss/policy_avg": 0.12261458486318588, + "lr": 9.488752556237219e-06, + "objective/entropy": -207.68580627441406, + "objective/kl": 33.91386413574219, + "objective/non_score_reward": -1.6956932544708252, + "objective/rlhf_reward": -5.301820400174021, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 25.18114471435547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 800, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9971894025802612 + }, + { + "episode": 12832, + "epoch": 0.07688344058189836, + "loss/policy_avg": 0.1192292720079422, + "lr": 9.488113496932516e-06, + "objective/entropy": -268.4300842285156, + "objective/kl": 26.710205078125, + "objective/non_score_reward": -1.3355103731155396, + "objective/rlhf_reward": -4.000405719786316, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.064979553222656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.638671875, + "step": 801, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9994006156921387 + }, + { + "episode": 12848, + "epoch": 0.0769793052210279, + "loss/policy_avg": 0.4274081587791443, + "lr": 9.487474437627813e-06, + "objective/entropy": -125.00625610351562, + "objective/kl": 36.30561065673828, + "objective/non_score_reward": -1.815280795097351, + "objective/rlhf_reward": -5.901873194907589, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 9.215574264526367, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.55078125, + "step": 802, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999779224395752 + }, + { + "episode": 12864, + "epoch": 0.07707516986015746, + "loss/policy_avg": 0.02082793414592743, + "lr": 9.48683537832311e-06, + "objective/entropy": 49.048545837402344, + "objective/kl": 31.830245971679688, + "objective/non_score_reward": -1.5915122032165527, + "objective/rlhf_reward": -4.915450672717437, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 2.6811680793762207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4384765625, + "step": 803, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994103908538818 + }, + { + "episode": 12880, + "epoch": 0.077171034499287, + "loss/policy_avg": 0.1582624763250351, + "lr": 9.486196319018407e-06, + "objective/entropy": -110.25260925292969, + "objective/kl": 31.00435447692871, + "objective/non_score_reward": -1.550217866897583, + "objective/rlhf_reward": -3.8008712291717526, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.5253429412841797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55859375, + "step": 804, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0027151107788086 + }, + { + "episode": 12896, + "epoch": 0.07726689913841656, + "loss/policy_avg": 0.09249435365200043, + "lr": 9.485557259713702e-06, + "objective/entropy": -203.63662719726562, + "objective/kl": 31.04816436767578, + "objective/non_score_reward": -1.552408218383789, + "objective/rlhf_reward": -4.547773247182952, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.3485993146896362, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 805, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999431848526001 + }, + { + "episode": 12912, + "epoch": 0.0773627637775461, + "loss/policy_avg": 0.44563794136047363, + "lr": 9.484918200408999e-06, + "objective/entropy": -163.74508666992188, + "objective/kl": 31.982746124267578, + "objective/non_score_reward": -1.599137306213379, + "objective/rlhf_reward": -3.472829972149107, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 87.72571563720703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.70703125, + "step": 806, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001553535461426 + }, + { + "episode": 12928, + "epoch": 0.07745862841667565, + "loss/policy_avg": -0.017649848014116287, + "lr": 9.484279141104296e-06, + "objective/entropy": -266.5451965332031, + "objective/kl": 27.058134078979492, + "objective/non_score_reward": -1.3529068231582642, + "objective/rlhf_reward": -1.0116270542144772, + "objective/scores": 1.1, + "policy/approxkl_avg": 5.037982940673828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 807, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0039749145507812 + }, + { + "episode": 12944, + "epoch": 0.07755449305580521, + "loss/policy_avg": 5.042888641357422, + "lr": 9.483640081799592e-06, + "objective/entropy": -212.65740966796875, + "objective/kl": 24.790084838867188, + "objective/non_score_reward": -1.2395042181015015, + "objective/rlhf_reward": -3.4770642546967263, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 11.046760559082031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.533203125, + "step": 808, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002431869506836 + }, + { + "episode": 12960, + "epoch": 0.07765035769493475, + "loss/policy_avg": -0.07623002678155899, + "lr": 9.48300102249489e-06, + "objective/entropy": -167.7131805419922, + "objective/kl": 31.204689025878906, + "objective/non_score_reward": -1.5602343082427979, + "objective/rlhf_reward": -4.790339152427062, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 5.110037803649902, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5234375, + "step": 809, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989296197891235 + }, + { + "episode": 12976, + "epoch": 0.07774622233406431, + "loss/policy_avg": 0.0697702169418335, + "lr": 9.482361963190185e-06, + "objective/entropy": -99.56057739257812, + "objective/kl": 40.95980453491211, + "objective/non_score_reward": -2.047990322113037, + "objective/rlhf_reward": -5.268241856933805, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.0177828073501587, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.595703125, + "step": 810, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999544620513916 + }, + { + "episode": 12992, + "epoch": 0.07784208697319385, + "loss/policy_avg": 0.011765815317630768, + "lr": 9.481722903885481e-06, + "objective/entropy": -270.2078857421875, + "objective/kl": 32.53266906738281, + "objective/non_score_reward": -1.6266334056854248, + "objective/rlhf_reward": -4.950274675098017, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 10.882495880126953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 811, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997708797454834 + }, + { + "episode": 13008, + "epoch": 0.0779379516123234, + "loss/policy_avg": 0.4012794494628906, + "lr": 9.481083844580777e-06, + "objective/entropy": -139.22914123535156, + "objective/kl": 37.05573272705078, + "objective/non_score_reward": -1.8527867794036865, + "objective/rlhf_reward": -5.586318249973367, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 210.83877563476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6640625, + "step": 812, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001636505126953 + }, + { + "episode": 13024, + "epoch": 0.07803381625145295, + "loss/policy_avg": 0.2699980139732361, + "lr": 9.480444785276073e-06, + "objective/entropy": -196.59963989257812, + "objective/kl": 30.699893951416016, + "objective/non_score_reward": -1.5349947214126587, + "objective/rlhf_reward": -3.7399788856506344, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.332146167755127, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 813, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989255666732788 + }, + { + "episode": 13040, + "epoch": 0.0781296808905825, + "loss/policy_avg": 0.20207370817661285, + "lr": 9.47980572597137e-06, + "objective/entropy": -267.2593994140625, + "objective/kl": 33.34029006958008, + "objective/non_score_reward": -1.6670145988464355, + "objective/rlhf_reward": -5.342545185118837, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 1.632169246673584, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.564453125, + "step": 814, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993088245391846 + }, + { + "episode": 13056, + "epoch": 0.07822554552971205, + "loss/policy_avg": 0.1745888739824295, + "lr": 9.479166666666667e-06, + "objective/entropy": -108.20680236816406, + "objective/kl": 35.203025817871094, + "objective/non_score_reward": -1.7601512670516968, + "objective/rlhf_reward": -5.484345762935236, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 8.32550048828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.736328125, + "step": 815, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0030176639556885 + }, + { + "episode": 13072, + "epoch": 0.0783214101688416, + "loss/policy_avg": 0.2600640654563904, + "lr": 9.478527607361964e-06, + "objective/entropy": -204.03048706054688, + "objective/kl": 40.41114807128906, + "objective/non_score_reward": -2.020557403564453, + "objective/rlhf_reward": -6.74059360316339, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.140628814697266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.619140625, + "step": 816, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0000810623168945 + }, + { + "episode": 13088, + "epoch": 0.07841727480797114, + "loss/policy_avg": 0.5273202061653137, + "lr": 9.477888548057261e-06, + "objective/entropy": -241.156494140625, + "objective/kl": 24.541404724121094, + "objective/non_score_reward": -1.2270702123641968, + "objective/rlhf_reward": -3.246421401918517, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 14.965031623840332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.61328125, + "step": 817, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990627765655518 + }, + { + "episode": 13104, + "epoch": 0.0785131394471007, + "loss/policy_avg": -0.09151424467563629, + "lr": 9.477249488752556e-06, + "objective/entropy": -219.21754455566406, + "objective/kl": 31.261905670166016, + "objective/non_score_reward": -1.5630953311920166, + "objective/rlhf_reward": -4.4275525763359775, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.8227334022521973, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.833984375, + "step": 818, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0036370754241943 + }, + { + "episode": 13120, + "epoch": 0.07860900408623024, + "loss/policy_avg": 0.13953115046024323, + "lr": 9.476610429447853e-06, + "objective/entropy": -186.8937530517578, + "objective/kl": 27.69632339477539, + "objective/non_score_reward": -1.3848161697387695, + "objective/rlhf_reward": -3.1392647981643673, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.2056455612182617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 819, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.003466844558716 + }, + { + "episode": 13136, + "epoch": 0.0787048687253598, + "loss/policy_avg": 0.6420396566390991, + "lr": 9.47597137014315e-06, + "objective/entropy": -134.00025939941406, + "objective/kl": 22.993852615356445, + "objective/non_score_reward": -1.1496926546096802, + "objective/rlhf_reward": -2.651359389500554, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 3.607414722442627, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 820, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000748634338379 + }, + { + "episode": 13152, + "epoch": 0.07880073336448934, + "loss/policy_avg": 0.08356916159391403, + "lr": 9.475332310838447e-06, + "objective/entropy": -189.72003173828125, + "objective/kl": 26.506973266601562, + "objective/non_score_reward": -1.3253486156463623, + "objective/rlhf_reward": -3.959759166746765, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.290050029754639, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 821, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9971721172332764 + }, + { + "episode": 13168, + "epoch": 0.0788965980036189, + "loss/policy_avg": 0.11917827278375626, + "lr": 9.474693251533744e-06, + "objective/entropy": -207.30722045898438, + "objective/kl": 35.41877746582031, + "objective/non_score_reward": -1.7709391117095947, + "objective/rlhf_reward": -5.683756327629089, + "objective/scores": 0.35, + "policy/approxkl_avg": 6.870448112487793, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 822, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9955227375030518 + }, + { + "episode": 13184, + "epoch": 0.07899246264274844, + "loss/policy_avg": -0.3528624475002289, + "lr": 9.474054192229039e-06, + "objective/entropy": -138.19627380371094, + "objective/kl": 27.491954803466797, + "objective/non_score_reward": -1.3745976686477661, + "objective/rlhf_reward": -4.156755199938446, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 1.025694727897644, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.52734375, + "step": 823, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0017786026000977 + }, + { + "episode": 13200, + "epoch": 0.079088327281878, + "loss/policy_avg": 0.3965766727924347, + "lr": 9.473415132924336e-06, + "objective/entropy": -244.5587921142578, + "objective/kl": 28.361434936523438, + "objective/non_score_reward": -1.4180717468261719, + "objective/rlhf_reward": -3.549580337778602, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 22.821792602539062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.53515625, + "step": 824, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.997540831565857 + }, + { + "episode": 13216, + "epoch": 0.07918419192100754, + "loss/policy_avg": 0.183881938457489, + "lr": 9.472776073619633e-06, + "objective/entropy": -235.81063842773438, + "objective/kl": 35.635047912597656, + "objective/non_score_reward": -1.7817524671554565, + "objective/rlhf_reward": -5.785373976736694, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 13.993101119995117, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 825, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987282752990723 + }, + { + "episode": 13232, + "epoch": 0.07928005656013709, + "loss/policy_avg": 0.13472305238246918, + "lr": 9.47213701431493e-06, + "objective/entropy": -209.61251831054688, + "objective/kl": 32.511722564697266, + "objective/non_score_reward": -1.6255862712860107, + "objective/rlhf_reward": -5.176832351714296, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 10.514575958251953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.693359375, + "step": 826, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980220794677734 + }, + { + "episode": 13248, + "epoch": 0.07937592119926663, + "loss/policy_avg": 0.28974202275276184, + "lr": 9.471497955010226e-06, + "objective/entropy": -277.55413818359375, + "objective/kl": 23.343517303466797, + "objective/non_score_reward": -1.1671757698059082, + "objective/rlhf_reward": -4.668703377246857, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.868777275085449, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.595703125, + "step": 827, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984409809112549 + }, + { + "episode": 13264, + "epoch": 0.07947178583839619, + "loss/policy_avg": 0.0649593323469162, + "lr": 9.470858895705523e-06, + "objective/entropy": -168.09161376953125, + "objective/kl": 32.58544921875, + "objective/non_score_reward": -1.6292723417282104, + "objective/rlhf_reward": -5.001317584308323, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 13.682709693908691, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.630859375, + "step": 828, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994233846664429 + }, + { + "episode": 13280, + "epoch": 0.07956765047752573, + "loss/policy_avg": 0.01223127543926239, + "lr": 9.470219836400818e-06, + "objective/entropy": -24.52312469482422, + "objective/kl": 37.070613861083984, + "objective/non_score_reward": -1.8535306453704834, + "objective/rlhf_reward": -5.963524679751739, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 1.4948031902313232, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.591796875, + "step": 829, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0006215572357178 + }, + { + "episode": 13296, + "epoch": 0.07966351511665529, + "loss/policy_avg": 0.08012821525335312, + "lr": 9.469580777096115e-06, + "objective/entropy": -222.74710083007812, + "objective/kl": 29.31437873840332, + "objective/non_score_reward": -1.4657189846038818, + "objective/rlhf_reward": -5.862875819206238, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.948197364807129, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.720703125, + "step": 830, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999361515045166 + }, + { + "episode": 13312, + "epoch": 0.07975937975578483, + "loss/policy_avg": 0.25244101881980896, + "lr": 9.468941717791412e-06, + "objective/entropy": -256.2400817871094, + "objective/kl": 25.82564926147461, + "objective/non_score_reward": -1.2912824153900146, + "objective/rlhf_reward": -5.165129542350769, + "objective/scores": 0.0, + "policy/approxkl_avg": 25.767894744873047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.625, + "step": 831, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9958332777023315 + }, + { + "episode": 13328, + "epoch": 0.07985524439491438, + "loss/policy_avg": 0.20151713490486145, + "lr": 9.468302658486709e-06, + "objective/entropy": -176.53012084960938, + "objective/kl": 31.989328384399414, + "objective/non_score_reward": -1.5994665622711182, + "objective/rlhf_reward": -4.94726787051712, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 6.573209762573242, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 832, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001397132873535 + }, + { + "episode": 13344, + "epoch": 0.07995110903404393, + "loss/policy_avg": 0.40637868642807007, + "lr": 9.467663599182006e-06, + "objective/entropy": -157.83944702148438, + "objective/kl": 26.236248016357422, + "objective/non_score_reward": -1.311812400817871, + "objective/rlhf_reward": -2.847249662876129, + "objective/scores": 0.6, + "policy/approxkl_avg": 41.408966064453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69921875, + "step": 833, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9979543685913086 + }, + { + "episode": 13360, + "epoch": 0.08004697367317348, + "loss/policy_avg": 0.4117756485939026, + "lr": 9.467024539877301e-06, + "objective/entropy": -154.52528381347656, + "objective/kl": 34.40885925292969, + "objective/non_score_reward": -1.7204430103302002, + "objective/rlhf_reward": -3.958053027034971, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.24909782409668, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.552734375, + "step": 834, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996882677078247 + }, + { + "episode": 13376, + "epoch": 0.08014283831230302, + "loss/policy_avg": 0.25968849658966064, + "lr": 9.466385480572598e-06, + "objective/entropy": -35.48725509643555, + "objective/kl": 48.416969299316406, + "objective/non_score_reward": -2.4208483695983887, + "objective/rlhf_reward": -8.324143612121029, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 7.6608781814575195, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4736328125, + "step": 835, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974427223205566 + }, + { + "episode": 13392, + "epoch": 0.08023870295143258, + "loss/policy_avg": 0.6013174057006836, + "lr": 9.465746421267893e-06, + "objective/entropy": -131.218994140625, + "objective/kl": 40.460113525390625, + "objective/non_score_reward": -2.023005723953247, + "objective/rlhf_reward": -6.267194564613412, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 5.2574968338012695, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.765625, + "step": 836, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985507726669312 + }, + { + "episode": 13408, + "epoch": 0.08033456759056212, + "loss/policy_avg": 0.024118170142173767, + "lr": 9.46510736196319e-06, + "objective/entropy": -219.2191162109375, + "objective/kl": 37.4605712890625, + "objective/non_score_reward": -1.8730283975601196, + "objective/rlhf_reward": -5.544702480511601, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.715839385986328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.603515625, + "step": 837, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.003450393676758 + }, + { + "episode": 13424, + "epoch": 0.08043043222969168, + "loss/policy_avg": 0.3022631108760834, + "lr": 9.464468302658487e-06, + "objective/entropy": -122.02997589111328, + "objective/kl": 32.87577438354492, + "objective/non_score_reward": -1.6437886953353882, + "objective/rlhf_reward": -5.196552612868649, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.3451852798461914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 838, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992682933807373 + }, + { + "episode": 13440, + "epoch": 0.08052629686882122, + "loss/policy_avg": 0.09435372054576874, + "lr": 9.463829243353784e-06, + "objective/entropy": -228.3193817138672, + "objective/kl": 27.057086944580078, + "objective/non_score_reward": -1.3528543710708618, + "objective/rlhf_reward": -3.7495579771405323, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 64.43006896972656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.642578125, + "step": 839, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9953296184539795 + }, + { + "episode": 13456, + "epoch": 0.08062216150795078, + "loss/policy_avg": 1.2935261726379395, + "lr": 9.46319018404908e-06, + "objective/entropy": -160.080322265625, + "objective/kl": 34.4007568359375, + "objective/non_score_reward": -1.7200379371643066, + "objective/rlhf_reward": -5.538515916376738, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 131.64187622070312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6015625, + "step": 840, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983408451080322 + }, + { + "episode": 13472, + "epoch": 0.08071802614708032, + "loss/policy_avg": 0.5178288817405701, + "lr": 9.462551124744378e-06, + "objective/entropy": -140.98907470703125, + "objective/kl": 32.42417526245117, + "objective/non_score_reward": -1.621208906173706, + "objective/rlhf_reward": -4.084835386276245, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.9638893604278564, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 841, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993921518325806 + }, + { + "episode": 13488, + "epoch": 0.08081389078620987, + "loss/policy_avg": 1.674887776374817, + "lr": 9.461912065439673e-06, + "objective/entropy": -140.6572723388672, + "objective/kl": 33.64493179321289, + "objective/non_score_reward": -1.682246446609497, + "objective/rlhf_reward": -2.3289861440658566, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.7393760681152344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73828125, + "step": 842, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0029544830322266 + }, + { + "episode": 13504, + "epoch": 0.08090975542533942, + "loss/policy_avg": 0.10809826105833054, + "lr": 9.46127300613497e-06, + "objective/entropy": 33.49109649658203, + "objective/kl": 46.121177673339844, + "objective/non_score_reward": -2.3060591220855713, + "objective/rlhf_reward": -7.399407501491616, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.247078895568848, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7578125, + "step": 843, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997997283935547 + }, + { + "episode": 13520, + "epoch": 0.08100562006446897, + "loss/policy_avg": 0.08235388994216919, + "lr": 9.460633946830267e-06, + "objective/entropy": -232.94918823242188, + "objective/kl": 29.242427825927734, + "objective/non_score_reward": -1.4621214866638184, + "objective/rlhf_reward": -5.848485827445984, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.9668121337890625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.708984375, + "step": 844, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998790979385376 + }, + { + "episode": 13536, + "epoch": 0.08110148470359851, + "loss/policy_avg": 0.15575401484966278, + "lr": 9.459994887525563e-06, + "objective/entropy": -230.47235107421875, + "objective/kl": 39.588829040527344, + "objective/non_score_reward": -1.9794416427612305, + "objective/rlhf_reward": -6.401994669231113, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 3.700314521789551, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.708984375, + "step": 845, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9979462623596191 + }, + { + "episode": 13552, + "epoch": 0.08119734934272807, + "loss/policy_avg": 0.13659973442554474, + "lr": 9.45935582822086e-06, + "objective/entropy": -174.33474731445312, + "objective/kl": 28.351903915405273, + "objective/non_score_reward": -1.4175951480865479, + "objective/rlhf_reward": -2.746661697269651, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.24754524230957, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 846, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0036792755126953 + }, + { + "episode": 13568, + "epoch": 0.08129321398185761, + "loss/policy_avg": -0.0010715574026107788, + "lr": 9.458716768916156e-06, + "objective/entropy": -106.94636535644531, + "objective/kl": 43.695289611816406, + "objective/non_score_reward": -2.1847643852233887, + "objective/rlhf_reward": -7.077198391378509, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 4.114851474761963, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 847, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0022311210632324 + }, + { + "episode": 13584, + "epoch": 0.08138907862098717, + "loss/policy_avg": -0.020745811983942986, + "lr": 9.458077709611452e-06, + "objective/entropy": -274.30377197265625, + "objective/kl": 29.099441528320312, + "objective/non_score_reward": -1.4549720287322998, + "objective/rlhf_reward": -4.215768191877919, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 5.374234199523926, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55078125, + "step": 848, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0002601146698 + }, + { + "episode": 13600, + "epoch": 0.08148494326011671, + "loss/policy_avg": 0.08369505405426025, + "lr": 9.45743865030675e-06, + "objective/entropy": -90.9344482421875, + "objective/kl": 32.62782669067383, + "objective/non_score_reward": -1.6313912868499756, + "objective/rlhf_reward": -4.921445462767201, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.873699426651001, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 849, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983861446380615 + }, + { + "episode": 13616, + "epoch": 0.08158080789924627, + "loss/policy_avg": 0.12610237300395966, + "lr": 9.456799591002046e-06, + "objective/entropy": -216.01071166992188, + "objective/kl": 31.95155906677246, + "objective/non_score_reward": -1.5975778102874756, + "objective/rlhf_reward": -5.048675945311218, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 17.690187454223633, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.630859375, + "step": 850, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975669384002686 + }, + { + "episode": 13632, + "epoch": 0.08167667253837581, + "loss/policy_avg": 0.09207138419151306, + "lr": 9.456160531697343e-06, + "objective/entropy": -213.504638671875, + "objective/kl": 33.958152770996094, + "objective/non_score_reward": -1.69790780544281, + "objective/rlhf_reward": -5.413029053298336, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 2.278407096862793, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.712890625, + "step": 851, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982938766479492 + }, + { + "episode": 13648, + "epoch": 0.08177253717750536, + "loss/policy_avg": 0.7879657745361328, + "lr": 9.45552147239264e-06, + "objective/entropy": -179.40536499023438, + "objective/kl": 38.20147705078125, + "objective/non_score_reward": -1.91007399559021, + "objective/rlhf_reward": -6.216463644702998, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 3.275893211364746, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.52734375, + "step": 852, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000800609588623 + }, + { + "episode": 13664, + "epoch": 0.0818684018166349, + "loss/policy_avg": -0.05168546736240387, + "lr": 9.454882413087935e-06, + "objective/entropy": -252.6636505126953, + "objective/kl": 36.603004455566406, + "objective/non_score_reward": -1.8301501274108887, + "objective/rlhf_reward": -5.65874100250064, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.8799333572387695, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.734375, + "step": 853, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000197410583496 + }, + { + "episode": 13680, + "epoch": 0.08196426645576446, + "loss/policy_avg": 0.35540589690208435, + "lr": 9.454243353783232e-06, + "objective/entropy": -263.326171875, + "objective/kl": 31.936683654785156, + "objective/non_score_reward": -1.5968341827392578, + "objective/rlhf_reward": -5.0618239379226395, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 9.10447883605957, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.591796875, + "step": 854, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981427192687988 + }, + { + "episode": 13696, + "epoch": 0.082060131094894, + "loss/policy_avg": -0.01314299926161766, + "lr": 9.453604294478529e-06, + "objective/entropy": -50.54448699951172, + "objective/kl": 27.010623931884766, + "objective/non_score_reward": -1.3505312204360962, + "objective/rlhf_reward": -4.002124941349029, + "objective/scores": 0.35, + "policy/approxkl_avg": 72.71121215820312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.755859375, + "step": 855, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988174438476562 + }, + { + "episode": 13712, + "epoch": 0.08215599573402356, + "loss/policy_avg": 0.507459282875061, + "lr": 9.452965235173824e-06, + "objective/entropy": -196.7661590576172, + "objective/kl": 41.39533615112305, + "objective/non_score_reward": -2.0697667598724365, + "objective/rlhf_reward": -6.331655929760869, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 6.633426666259766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.734375, + "step": 856, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9997053146362305 + }, + { + "episode": 13728, + "epoch": 0.0822518603731531, + "loss/policy_avg": 0.01022842712700367, + "lr": 9.452326175869121e-06, + "objective/entropy": -165.575439453125, + "objective/kl": 28.162111282348633, + "objective/non_score_reward": -1.408105492591858, + "objective/rlhf_reward": -4.253819801894528, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 2.566072463989258, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.673828125, + "step": 857, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994425773620605 + }, + { + "episode": 13744, + "epoch": 0.08234772501228266, + "loss/policy_avg": 0.5199975371360779, + "lr": 9.451687116564418e-06, + "objective/entropy": -191.289794921875, + "objective/kl": 25.639848709106445, + "objective/non_score_reward": -1.2819924354553223, + "objective/rlhf_reward": -3.6121978996121253, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 2.2938754558563232, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4873046875, + "step": 858, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993865489959717 + }, + { + "episode": 13760, + "epoch": 0.0824435896514122, + "loss/policy_avg": -0.09089094400405884, + "lr": 9.451048057259715e-06, + "objective/entropy": -222.6432647705078, + "objective/kl": 35.101905822753906, + "objective/non_score_reward": -1.7550954818725586, + "objective/rlhf_reward": -5.641779580203396, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.5215179920196533, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.701171875, + "step": 859, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0004208087921143 + }, + { + "episode": 13776, + "epoch": 0.08253945429054176, + "loss/policy_avg": 0.3994244635105133, + "lr": 9.45040899795501e-06, + "objective/entropy": -232.05795288085938, + "objective/kl": 35.13083267211914, + "objective/non_score_reward": -1.7565417289733887, + "objective/rlhf_reward": -2.6261669158935543, + "objective/scores": 1.1, + "policy/approxkl_avg": 7.337094306945801, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.736328125, + "step": 860, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000622272491455 + }, + { + "episode": 13792, + "epoch": 0.0826353189296713, + "loss/policy_avg": 0.241072878241539, + "lr": 9.449769938650307e-06, + "objective/entropy": -235.5231475830078, + "objective/kl": 42.96981430053711, + "objective/non_score_reward": -2.1484906673431396, + "objective/rlhf_reward": -6.860629336039224, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 5.666136264801025, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 861, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980902671813965 + }, + { + "episode": 13808, + "epoch": 0.08273118356880085, + "loss/policy_avg": 0.06892701238393784, + "lr": 9.449130879345604e-06, + "objective/entropy": -43.37392044067383, + "objective/kl": 28.94279670715332, + "objective/non_score_reward": -1.447139859199524, + "objective/rlhf_reward": -4.446923902540832, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 13.682140350341797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.873046875, + "step": 862, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998504638671875 + }, + { + "episode": 13824, + "epoch": 0.0828270482079304, + "loss/policy_avg": 0.05104389786720276, + "lr": 9.4484918200409e-06, + "objective/entropy": -274.24462890625, + "objective/kl": 26.58008575439453, + "objective/non_score_reward": -1.3290044069290161, + "objective/rlhf_reward": -3.6541578821545704, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 14.673041343688965, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 863, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998407244682312 + }, + { + "episode": 13840, + "epoch": 0.08292291284705995, + "loss/policy_avg": 2.0433521270751953, + "lr": 9.447852760736197e-06, + "objective/entropy": -141.08175659179688, + "objective/kl": 38.66474151611328, + "objective/non_score_reward": -1.933237075805664, + "objective/rlhf_reward": -6.282350401492462, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 3.4866771697998047, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.849609375, + "step": 864, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0020241737365723 + }, + { + "episode": 13856, + "epoch": 0.08301877748618951, + "loss/policy_avg": 0.5822303891181946, + "lr": 9.447213701431494e-06, + "objective/entropy": -75.44483184814453, + "objective/kl": 42.41981506347656, + "objective/non_score_reward": -2.1209909915924072, + "objective/rlhf_reward": -7.033365587802276, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 1.0502395629882812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.763671875, + "step": 865, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002496242523193 + }, + { + "episode": 13872, + "epoch": 0.08311464212531905, + "loss/policy_avg": 1.5961978435516357, + "lr": 9.44657464212679e-06, + "objective/entropy": -102.62336730957031, + "objective/kl": 32.63710021972656, + "objective/non_score_reward": -1.6318550109863281, + "objective/rlhf_reward": -4.702591176303934, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 44.34449005126953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.583984375, + "step": 866, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997666597366333 + }, + { + "episode": 13888, + "epoch": 0.0832105067644486, + "loss/policy_avg": -0.06377097964286804, + "lr": 9.445935582822086e-06, + "objective/entropy": -179.53016662597656, + "objective/kl": 27.1846981048584, + "objective/non_score_reward": -1.3592349290847778, + "objective/rlhf_reward": -3.3142334840455394, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 11.25791072845459, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.67578125, + "step": 867, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001521110534668 + }, + { + "episode": 13904, + "epoch": 0.08330637140357815, + "loss/policy_avg": 0.06122337281703949, + "lr": 9.445296523517383e-06, + "objective/entropy": -160.8975830078125, + "objective/kl": 37.28607940673828, + "objective/non_score_reward": -1.8643040657043457, + "objective/rlhf_reward": -6.131703171759767, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 2.380110263824463, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.685546875, + "step": 868, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993302822113037 + }, + { + "episode": 13920, + "epoch": 0.0834022360427077, + "loss/policy_avg": 0.06397978216409683, + "lr": 9.44465746421268e-06, + "objective/entropy": -279.75146484375, + "objective/kl": 36.54051971435547, + "objective/non_score_reward": -1.8270260095596313, + "objective/rlhf_reward": -5.3606928093003585, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 9.166413307189941, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6953125, + "step": 869, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998555064201355 + }, + { + "episode": 13936, + "epoch": 0.08349810068183725, + "loss/policy_avg": 0.18339544534683228, + "lr": 9.444018404907977e-06, + "objective/entropy": -197.06088256835938, + "objective/kl": 35.413883209228516, + "objective/non_score_reward": -1.7706942558288574, + "objective/rlhf_reward": -5.420917516172516, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 2.4228196144104004, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.55859375, + "step": 870, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997130632400513 + }, + { + "episode": 13952, + "epoch": 0.0835939653209668, + "loss/policy_avg": 0.7395508885383606, + "lr": 9.443379345603272e-06, + "objective/entropy": -175.5420684814453, + "objective/kl": 27.310260772705078, + "objective/non_score_reward": -1.3655130863189697, + "objective/rlhf_reward": -3.9057928611903936, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 20.016393661499023, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.744140625, + "step": 871, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980931282043457 + }, + { + "episode": 13968, + "epoch": 0.08368982996009634, + "loss/policy_avg": 0.11419187486171722, + "lr": 9.442740286298569e-06, + "objective/entropy": -202.19219970703125, + "objective/kl": 26.73446273803711, + "objective/non_score_reward": -1.3367230892181396, + "objective/rlhf_reward": -0.9468923568725582, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.4593892097473145, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 872, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997605085372925 + }, + { + "episode": 13984, + "epoch": 0.0837856945992259, + "loss/policy_avg": 0.10254265367984772, + "lr": 9.442101226993866e-06, + "objective/entropy": -181.49607849121094, + "objective/kl": 34.489620208740234, + "objective/non_score_reward": -1.7244811058044434, + "objective/rlhf_reward": -5.2938043213525585, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 10.685236930847168, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.572265625, + "step": 873, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004138946533203 + }, + { + "episode": 14000, + "epoch": 0.08388155923835544, + "loss/policy_avg": -0.11048807948827744, + "lr": 9.441462167689163e-06, + "objective/entropy": -233.58718872070312, + "objective/kl": 27.196325302124023, + "objective/non_score_reward": -1.359816312789917, + "objective/rlhf_reward": -4.080015146468563, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 7.074767112731934, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.599609375, + "step": 874, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000337600708008 + }, + { + "episode": 14016, + "epoch": 0.083977423877485, + "loss/policy_avg": -0.04991217330098152, + "lr": 9.44082310838446e-06, + "objective/entropy": -147.29574584960938, + "objective/kl": 39.145992279052734, + "objective/non_score_reward": -1.9572995901107788, + "objective/rlhf_reward": -6.429198360443115, + "objective/scores": 0.35, + "policy/approxkl_avg": 2.3655714988708496, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.703125, + "step": 875, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0024516582489014 + }, + { + "episode": 14032, + "epoch": 0.08407328851661454, + "loss/policy_avg": 0.018214020878076553, + "lr": 9.440184049079757e-06, + "objective/entropy": -225.25274658203125, + "objective/kl": 28.496929168701172, + "objective/non_score_reward": -1.4248464107513428, + "objective/rlhf_reward": -4.248787502856597, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 8.280494689941406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.78515625, + "step": 876, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0028023719787598 + }, + { + "episode": 14048, + "epoch": 0.0841691531557441, + "loss/policy_avg": -0.0712839737534523, + "lr": 9.439544989775052e-06, + "objective/entropy": -111.49925231933594, + "objective/kl": 33.307395935058594, + "objective/non_score_reward": -1.6653697490692139, + "objective/rlhf_reward": -5.237647135456172, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 10.649118423461914, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.744140625, + "step": 877, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0005428791046143 + }, + { + "episode": 14064, + "epoch": 0.08426501779487364, + "loss/policy_avg": 0.416260302066803, + "lr": 9.438905930470349e-06, + "objective/entropy": -91.5921630859375, + "objective/kl": 36.07551193237305, + "objective/non_score_reward": -1.8037755489349365, + "objective/rlhf_reward": -5.658842890468195, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 14.971528053283691, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.546875, + "step": 878, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99779212474823 + }, + { + "episode": 14080, + "epoch": 0.08436088243400319, + "loss/policy_avg": 0.6945221424102783, + "lr": 9.438266871165644e-06, + "objective/entropy": -103.2996597290039, + "objective/kl": 29.02838706970215, + "objective/non_score_reward": -1.4514193534851074, + "objective/rlhf_reward": -4.249418287482813, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 3.5951104164123535, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6015625, + "step": 879, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996235370635986 + }, + { + "episode": 14096, + "epoch": 0.08445674707313273, + "loss/policy_avg": 0.14096970856189728, + "lr": 9.43762781186094e-06, + "objective/entropy": -250.6915283203125, + "objective/kl": 24.03522491455078, + "objective/non_score_reward": -1.2017613649368286, + "objective/rlhf_reward": -3.4284433508790553, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 141.8468017578125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.681640625, + "step": 880, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993736743927002 + }, + { + "episode": 14112, + "epoch": 0.08455261171226229, + "loss/policy_avg": 0.3699185848236084, + "lr": 9.436988752556238e-06, + "objective/entropy": -159.3045196533203, + "objective/kl": 40.019386291503906, + "objective/non_score_reward": -2.000969409942627, + "objective/rlhf_reward": -6.553278903575286, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 8.20317554473877, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.775390625, + "step": 881, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977927207946777 + }, + { + "episode": 14128, + "epoch": 0.08464847635139183, + "loss/policy_avg": 0.41995298862457275, + "lr": 9.436349693251534e-06, + "objective/entropy": 76.95626068115234, + "objective/kl": 39.00627899169922, + "objective/non_score_reward": -1.9503138065338135, + "objective/rlhf_reward": -6.139395838201629, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 31.75859832763672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.568359375, + "step": 882, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9953019618988037 + }, + { + "episode": 14144, + "epoch": 0.08474434099052139, + "loss/policy_avg": 0.5355075597763062, + "lr": 9.435710633946831e-06, + "objective/entropy": -164.35186767578125, + "objective/kl": 42.27740478515625, + "objective/non_score_reward": -2.113870143890381, + "objective/rlhf_reward": -7.113845041304259, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 20.66805648803711, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.72265625, + "step": 883, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9979139566421509 + }, + { + "episode": 14160, + "epoch": 0.08484020562965093, + "loss/policy_avg": 0.12046757340431213, + "lr": 9.435071574642126e-06, + "objective/entropy": -139.48226928710938, + "objective/kl": 35.96446228027344, + "objective/non_score_reward": -1.7982230186462402, + "objective/rlhf_reward": -5.833642208312435, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 5.584999084472656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.763671875, + "step": 884, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985802173614502 + }, + { + "episode": 14176, + "epoch": 0.08493607026878049, + "loss/policy_avg": 0.20259422063827515, + "lr": 9.434432515337423e-06, + "objective/entropy": -194.32472229003906, + "objective/kl": 29.422592163085938, + "objective/non_score_reward": -1.4711295366287231, + "objective/rlhf_reward": -4.151184813181559, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 7.590093612670898, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.705078125, + "step": 885, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000078678131104 + }, + { + "episode": 14192, + "epoch": 0.08503193490791003, + "loss/policy_avg": 0.38378089666366577, + "lr": 9.43379345603272e-06, + "objective/entropy": -204.80718994140625, + "objective/kl": 26.858444213867188, + "objective/non_score_reward": -1.3429222106933594, + "objective/rlhf_reward": -3.947856862743465, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 54.279869079589844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8828125, + "step": 886, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000415086746216 + }, + { + "episode": 14208, + "epoch": 0.08512779954703958, + "loss/policy_avg": 0.27804744243621826, + "lr": 9.433154396728017e-06, + "objective/entropy": -216.76026916503906, + "objective/kl": 31.35245704650879, + "objective/non_score_reward": -1.5676229000091553, + "objective/rlhf_reward": -4.928855529337554, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 44.15214157104492, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.841796875, + "step": 887, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9985570907592773 + }, + { + "episode": 14224, + "epoch": 0.08522366418616913, + "loss/policy_avg": 0.1285010725259781, + "lr": 9.432515337423314e-06, + "objective/entropy": -256.2292175292969, + "objective/kl": 22.457351684570312, + "objective/non_score_reward": -1.1228675842285156, + "objective/rlhf_reward": -2.5440589291619613, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.694319725036621, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.591796875, + "step": 888, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9982428550720215 + }, + { + "episode": 14240, + "epoch": 0.08531952882529868, + "loss/policy_avg": 0.1620079129934311, + "lr": 9.431876278118611e-06, + "objective/entropy": -246.3665313720703, + "objective/kl": 32.27862548828125, + "objective/non_score_reward": -1.6139311790466309, + "objective/rlhf_reward": -5.03189285536584, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 4.128833293914795, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.626953125, + "step": 889, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0017335414886475 + }, + { + "episode": 14256, + "epoch": 0.08541539346442822, + "loss/policy_avg": 0.6714350581169128, + "lr": 9.431237218813906e-06, + "objective/entropy": -87.00444793701172, + "objective/kl": 30.12160873413086, + "objective/non_score_reward": -1.5060807466506958, + "objective/rlhf_reward": -4.600490768154231, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 30.543041229248047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 890, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9955544471740723 + }, + { + "episode": 14272, + "epoch": 0.08551125810355778, + "loss/policy_avg": 0.5368032455444336, + "lr": 9.430598159509203e-06, + "objective/entropy": -151.2410125732422, + "objective/kl": 23.1306095123291, + "objective/non_score_reward": -1.1565306186676025, + "objective/rlhf_reward": -3.266872340176983, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 18.648775100708008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 891, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999136209487915 + }, + { + "episode": 14288, + "epoch": 0.08560712274268732, + "loss/policy_avg": -0.4043048024177551, + "lr": 9.4299591002045e-06, + "objective/entropy": -214.12281799316406, + "objective/kl": 38.173484802246094, + "objective/non_score_reward": -1.9086743593215942, + "objective/rlhf_reward": -5.972837810934173, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 3.6675243377685547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.580078125, + "step": 892, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000812292098999 + }, + { + "episode": 14304, + "epoch": 0.08570298738181688, + "loss/policy_avg": 1.0885683298110962, + "lr": 9.429320040899797e-06, + "objective/entropy": -234.37998962402344, + "objective/kl": 27.995094299316406, + "objective/non_score_reward": -1.3997547626495361, + "objective/rlhf_reward": -4.239769363139553, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.2649099826812744, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640625, + "step": 893, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0003528594970703 + }, + { + "episode": 14320, + "epoch": 0.08579885202094642, + "loss/policy_avg": -0.1013278141617775, + "lr": 9.428680981595094e-06, + "objective/entropy": -156.33245849609375, + "objective/kl": 35.587982177734375, + "objective/non_score_reward": -1.779399037361145, + "objective/rlhf_reward": -5.738993861762387, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 8.389669418334961, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.666015625, + "step": 894, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99942946434021 + }, + { + "episode": 14336, + "epoch": 0.08589471666007598, + "loss/policy_avg": -0.006531273480504751, + "lr": 9.42804192229039e-06, + "objective/entropy": -197.26820373535156, + "objective/kl": 21.04766082763672, + "objective/non_score_reward": -1.0523829460144043, + "objective/rlhf_reward": -2.6532727172046453, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.4280903339385986, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69140625, + "step": 895, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001786708831787 + }, + { + "episode": 14352, + "epoch": 0.08599058129920552, + "loss/policy_avg": 0.10259456932544708, + "lr": 9.427402862985686e-06, + "objective/entropy": -120.49540710449219, + "objective/kl": 37.17432403564453, + "objective/non_score_reward": -1.858716368675232, + "objective/rlhf_reward": -3.034865355491638, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.6070685386657715, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.615234375, + "step": 896, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999729871749878 + }, + { + "episode": 14368, + "epoch": 0.08608644593833507, + "loss/policy_avg": 0.18344524502754211, + "lr": 9.426763803680982e-06, + "objective/entropy": -84.0172348022461, + "objective/kl": 32.38622283935547, + "objective/non_score_reward": -1.6193112134933472, + "objective/rlhf_reward": -5.117994987700863, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 33.82829284667969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.529296875, + "step": 897, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9998817443847656 + }, + { + "episode": 14384, + "epoch": 0.08618231057746462, + "loss/policy_avg": 0.7863380312919617, + "lr": 9.42612474437628e-06, + "objective/entropy": -94.4057388305664, + "objective/kl": 31.75823402404785, + "objective/non_score_reward": -1.58791184425354, + "objective/rlhf_reward": -3.4279281839143962, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 3.406008243560791, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.8125, + "step": 898, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0038909912109375 + }, + { + "episode": 14400, + "epoch": 0.08627817521659417, + "loss/policy_avg": 0.5351603031158447, + "lr": 9.425485685071576e-06, + "objective/entropy": -265.2181396484375, + "objective/kl": 29.21182632446289, + "objective/non_score_reward": -1.460591197013855, + "objective/rlhf_reward": -4.1090314547220865, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.054888725280762, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.57421875, + "step": 899, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998013973236084 + }, + { + "episode": 14416, + "epoch": 0.08637403985572371, + "loss/policy_avg": 0.013534091413021088, + "lr": 9.424846625766873e-06, + "objective/entropy": -194.56564331054688, + "objective/kl": 24.970386505126953, + "objective/non_score_reward": -1.2485194206237793, + "objective/rlhf_reward": -3.6348278162225913, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 0.42985397577285767, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.53515625, + "step": 900, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002646446228027 + }, + { + "episode": 14432, + "epoch": 0.08646990449485327, + "loss/policy_avg": -0.026506464928388596, + "lr": 9.424207566462168e-06, + "objective/entropy": -121.82954406738281, + "objective/kl": 38.97528839111328, + "objective/non_score_reward": -1.9487645626068115, + "objective/rlhf_reward": -7.795057892799377, + "objective/scores": 0.0, + "policy/approxkl_avg": 18.97709846496582, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3564453125, + "step": 901, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988110065460205 + }, + { + "episode": 14448, + "epoch": 0.08656576913398281, + "loss/policy_avg": 0.04643288254737854, + "lr": 9.423568507157465e-06, + "objective/entropy": -97.38468170166016, + "objective/kl": 28.042333602905273, + "objective/non_score_reward": -1.4021167755126953, + "objective/rlhf_reward": -4.184634823997585, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 2.1407618522644043, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.60546875, + "step": 902, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999892711639404 + }, + { + "episode": 14464, + "epoch": 0.08666163377311237, + "loss/policy_avg": 0.5154027342796326, + "lr": 9.42292944785276e-06, + "objective/entropy": -250.2370147705078, + "objective/kl": 25.91543960571289, + "objective/non_score_reward": -1.2957720756530762, + "objective/rlhf_reward": -3.759255845745174, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 1.9840008020401, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.607421875, + "step": 903, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984569549560547 + }, + { + "episode": 14480, + "epoch": 0.08675749841224191, + "loss/policy_avg": -0.12090878188610077, + "lr": 9.422290388548057e-06, + "objective/entropy": -224.9342041015625, + "objective/kl": 21.860130310058594, + "objective/non_score_reward": -1.0930064916610718, + "objective/rlhf_reward": -2.42461485691541, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 6.253545761108398, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.751953125, + "step": 904, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000030755996704 + }, + { + "episode": 14496, + "epoch": 0.08685336305137147, + "loss/policy_avg": 0.2192097306251526, + "lr": 9.421651329243354e-06, + "objective/entropy": -116.75704956054688, + "objective/kl": 40.641937255859375, + "objective/non_score_reward": -2.0320968627929688, + "objective/rlhf_reward": -6.786751320868163, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 3.1222383975982666, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.525390625, + "step": 905, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990143775939941 + }, + { + "episode": 14512, + "epoch": 0.08694922769050101, + "loss/policy_avg": 0.014911421574652195, + "lr": 9.421012269938651e-06, + "objective/entropy": -169.34967041015625, + "objective/kl": 19.47471809387207, + "objective/non_score_reward": -0.9737359285354614, + "objective/rlhf_reward": -1.7722373626389838, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 3.2120165824890137, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 906, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0021252632141113 + }, + { + "episode": 14528, + "epoch": 0.08704509232963056, + "loss/policy_avg": -0.06861399114131927, + "lr": 9.420373210633948e-06, + "objective/entropy": -199.73748779296875, + "objective/kl": 32.33728790283203, + "objective/non_score_reward": -1.6168644428253174, + "objective/rlhf_reward": -5.088855722037655, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 7.329561233520508, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.58203125, + "step": 907, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.1414122581481934 + }, + { + "episode": 14544, + "epoch": 0.0871409569687601, + "loss/policy_avg": -0.0006491807289421558, + "lr": 9.419734151329245e-06, + "objective/entropy": -241.5037078857422, + "objective/kl": 26.676612854003906, + "objective/non_score_reward": -1.3338308334350586, + "objective/rlhf_reward": -0.9353229761123654, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.882882595062256, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.568359375, + "step": 908, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995675086975098 + }, + { + "episode": 14560, + "epoch": 0.08723682160788966, + "loss/policy_avg": -0.30844664573669434, + "lr": 9.41909509202454e-06, + "objective/entropy": -193.48281860351562, + "objective/kl": 32.22890853881836, + "objective/non_score_reward": -1.6114455461502075, + "objective/rlhf_reward": -4.712448493639627, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 9.586688995361328, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.537109375, + "step": 909, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0032057762145996 + }, + { + "episode": 14576, + "epoch": 0.0873326862470192, + "loss/policy_avg": 0.10456671565771103, + "lr": 9.418456032719837e-06, + "objective/entropy": -214.8862762451172, + "objective/kl": 30.845359802246094, + "objective/non_score_reward": -1.5422677993774414, + "objective/rlhf_reward": -4.769071197509765, + "objective/scores": 0.35, + "policy/approxkl_avg": 48.766883850097656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.560546875, + "step": 910, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0017874240875244 + }, + { + "episode": 14592, + "epoch": 0.08742855088614876, + "loss/policy_avg": 0.011322952806949615, + "lr": 9.417816973415134e-06, + "objective/entropy": -148.18869018554688, + "objective/kl": 34.653785705566406, + "objective/non_score_reward": -1.7326891422271729, + "objective/rlhf_reward": -5.5069247080880075, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 2.303962230682373, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.740234375, + "step": 911, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001537561416626 + }, + { + "episode": 14608, + "epoch": 0.0875244155252783, + "loss/policy_avg": 1.4446654319763184, + "lr": 9.41717791411043e-06, + "objective/entropy": -151.7039337158203, + "objective/kl": 36.139678955078125, + "objective/non_score_reward": -1.8069840669631958, + "objective/rlhf_reward": -5.623816165987568, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 8.342704772949219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 912, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997730016708374 + }, + { + "episode": 14624, + "epoch": 0.08762028016440786, + "loss/policy_avg": 0.007501431740820408, + "lr": 9.416538854805727e-06, + "objective/entropy": -192.82723999023438, + "objective/kl": 28.006526947021484, + "objective/non_score_reward": -1.4003264904022217, + "objective/rlhf_reward": -3.776477153572153, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 9.397720336914062, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.74609375, + "step": 913, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9988361597061157 + }, + { + "episode": 14640, + "epoch": 0.0877161448035374, + "loss/policy_avg": 0.7067223787307739, + "lr": 9.415899795501023e-06, + "objective/entropy": -199.13888549804688, + "objective/kl": 40.245330810546875, + "objective/non_score_reward": -2.0122666358947754, + "objective/rlhf_reward": -6.387206798017608, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.6032171249389648, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.626953125, + "step": 914, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0032029151916504 + }, + { + "episode": 14656, + "epoch": 0.08781200944266696, + "loss/policy_avg": 0.7447987198829651, + "lr": 9.41526073619632e-06, + "objective/entropy": -192.03024291992188, + "objective/kl": 33.84302520751953, + "objective/non_score_reward": -1.6921510696411133, + "objective/rlhf_reward": -5.212345330920771, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 12.58854866027832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.708984375, + "step": 915, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982028007507324 + }, + { + "episode": 14672, + "epoch": 0.0879078740817965, + "loss/policy_avg": -0.12448902428150177, + "lr": 9.414621676891616e-06, + "objective/entropy": -108.39199829101562, + "objective/kl": 27.540185928344727, + "objective/non_score_reward": -1.3770092725753784, + "objective/rlhf_reward": -3.3853308580079418, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 0.6809393763542175, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.771484375, + "step": 916, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0027718544006348 + }, + { + "episode": 14688, + "epoch": 0.08800373872092605, + "loss/policy_avg": 0.09778769314289093, + "lr": 9.413982617586913e-06, + "objective/entropy": -83.20165252685547, + "objective/kl": 27.68124008178711, + "objective/non_score_reward": -1.3840619325637817, + "objective/rlhf_reward": -3.7114191010323276, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 7.001269340515137, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 917, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9994606971740723 + }, + { + "episode": 14704, + "epoch": 0.0880996033600556, + "loss/policy_avg": 0.7267050743103027, + "lr": 9.41334355828221e-06, + "objective/entropy": -174.48663330078125, + "objective/kl": 32.38935089111328, + "objective/non_score_reward": -1.6194674968719482, + "objective/rlhf_reward": -6.477869987487793, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.753436088562012, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6796875, + "step": 918, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9973618984222412 + }, + { + "episode": 14720, + "epoch": 0.08819546799918515, + "loss/policy_avg": 0.18099595606327057, + "lr": 9.412704498977507e-06, + "objective/entropy": -232.4264373779297, + "objective/kl": 37.20670700073242, + "objective/non_score_reward": -1.860335350036621, + "objective/rlhf_reward": -6.115828309088869, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 41.893341064453125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.79296875, + "step": 919, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.002889633178711 + }, + { + "episode": 14736, + "epoch": 0.08829133263831469, + "loss/policy_avg": 0.43639302253723145, + "lr": 9.412065439672802e-06, + "objective/entropy": -183.69644165039062, + "objective/kl": 24.13558006286621, + "objective/non_score_reward": -1.2067790031433105, + "objective/rlhf_reward": -2.8797047836350753, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 30.2447509765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55078125, + "step": 920, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992506504058838 + }, + { + "episode": 14752, + "epoch": 0.08838719727744425, + "loss/policy_avg": 0.5567411780357361, + "lr": 9.411426380368099e-06, + "objective/entropy": -285.06512451171875, + "objective/kl": 32.89839553833008, + "objective/non_score_reward": -1.644919753074646, + "objective/rlhf_reward": -4.8463457385698945, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 18.15423583984375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.609375, + "step": 921, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997804880142212 + }, + { + "episode": 14768, + "epoch": 0.0884830619165738, + "loss/policy_avg": -0.024971559643745422, + "lr": 9.410787321063396e-06, + "objective/entropy": -144.00473022460938, + "objective/kl": 27.061277389526367, + "objective/non_score_reward": -1.353063941001892, + "objective/rlhf_reward": -4.033653714743954, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.530630111694336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 922, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0000722408294678 + }, + { + "episode": 14784, + "epoch": 0.08857892655570335, + "loss/policy_avg": -0.49618157744407654, + "lr": 9.410148261758691e-06, + "objective/entropy": -37.43824768066406, + "objective/kl": 35.81788635253906, + "objective/non_score_reward": -1.7908943891525269, + "objective/rlhf_reward": -5.501717870653259, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 4.131357192993164, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.4287109375, + "step": 923, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0173897743225098 + }, + { + "episode": 14800, + "epoch": 0.0886747911948329, + "loss/policy_avg": 0.05783979594707489, + "lr": 9.409509202453988e-06, + "objective/entropy": -154.13516235351562, + "objective/kl": 46.57448196411133, + "objective/non_score_reward": -2.3287243843078613, + "objective/rlhf_reward": -7.653037791669952, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 2.5200886726379395, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5390625, + "step": 924, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987819194793701 + }, + { + "episode": 14816, + "epoch": 0.08877065583396244, + "loss/policy_avg": 0.034926094114780426, + "lr": 9.408870143149285e-06, + "objective/entropy": -221.52577209472656, + "objective/kl": 35.47760772705078, + "objective/non_score_reward": -1.7738804817199707, + "objective/rlhf_reward": -5.614569070752024, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 1.4324332475662231, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 925, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995288848876953 + }, + { + "episode": 14832, + "epoch": 0.088866520473092, + "loss/policy_avg": 0.32427555322647095, + "lr": 9.408231083844582e-06, + "objective/entropy": -130.25445556640625, + "objective/kl": 34.63972473144531, + "objective/non_score_reward": -1.7319860458374023, + "objective/rlhf_reward": -5.371684878078058, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 3.6408345699310303, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.568359375, + "step": 926, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0012738704681396 + }, + { + "episode": 14848, + "epoch": 0.08896238511222154, + "loss/policy_avg": -0.27763280272483826, + "lr": 9.407592024539877e-06, + "objective/entropy": -244.65667724609375, + "objective/kl": 27.930646896362305, + "objective/non_score_reward": -1.396532416343689, + "objective/rlhf_reward": -3.7613009765473118, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 65.45894622802734, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.66015625, + "step": 927, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002328872680664 + }, + { + "episode": 14864, + "epoch": 0.0890582497513511, + "loss/policy_avg": 0.39164024591445923, + "lr": 9.406952965235174e-06, + "objective/entropy": -92.6754150390625, + "objective/kl": 40.35970687866211, + "objective/non_score_reward": -2.0179853439331055, + "objective/rlhf_reward": -5.148222361446592, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.222280502319336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.587890625, + "step": 928, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9965031147003174 + }, + { + "episode": 14880, + "epoch": 0.08915411439048064, + "loss/policy_avg": 0.018820755183696747, + "lr": 9.40631390593047e-06, + "objective/entropy": -221.75802612304688, + "objective/kl": 32.733848571777344, + "objective/non_score_reward": -1.6366922855377197, + "objective/rlhf_reward": -4.942649397913533, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 1.5601739883422852, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.67578125, + "step": 929, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0016555786132812 + }, + { + "episode": 14896, + "epoch": 0.0892499790296102, + "loss/policy_avg": 0.02956710010766983, + "lr": 9.405674846625768e-06, + "objective/entropy": -225.1991729736328, + "objective/kl": 27.00541114807129, + "objective/non_score_reward": -1.3502705097198486, + "objective/rlhf_reward": -4.059446623831421, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 6.755413055419922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8046875, + "step": 930, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001051664352417 + }, + { + "episode": 14912, + "epoch": 0.08934584366873974, + "loss/policy_avg": 0.08642945438623428, + "lr": 9.405035787321065e-06, + "objective/entropy": -179.3356475830078, + "objective/kl": 36.390193939208984, + "objective/non_score_reward": -1.8195096254348755, + "objective/rlhf_reward": -5.330627392010625, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 10.583852767944336, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4501953125, + "step": 931, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999453067779541 + }, + { + "episode": 14928, + "epoch": 0.0894417083078693, + "loss/policy_avg": 0.08466912060976028, + "lr": 9.404396728016361e-06, + "objective/entropy": -160.34024047851562, + "objective/kl": 48.99607849121094, + "objective/non_score_reward": -2.4498043060302734, + "objective/rlhf_reward": -8.195096407000142, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 0.9886335134506226, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.666015625, + "step": 932, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0025954246520996 + }, + { + "episode": 14944, + "epoch": 0.08953757294699884, + "loss/policy_avg": 0.3508598804473877, + "lr": 9.403757668711657e-06, + "objective/entropy": -177.20993041992188, + "objective/kl": 32.381324768066406, + "objective/non_score_reward": -1.6190659999847412, + "objective/rlhf_reward": -5.150751504927797, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 31.277324676513672, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4716796875, + "step": 933, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997992515563965 + }, + { + "episode": 14960, + "epoch": 0.08963343758612839, + "loss/policy_avg": 0.11015394330024719, + "lr": 9.403118609406953e-06, + "objective/entropy": -203.39776611328125, + "objective/kl": 32.743534088134766, + "objective/non_score_reward": -1.637176752090454, + "objective/rlhf_reward": -4.94458726412447, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.4484572410583496, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.734375, + "step": 934, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9993422031402588 + }, + { + "episode": 14976, + "epoch": 0.08972930222525793, + "loss/policy_avg": -0.10944172739982605, + "lr": 9.40247955010225e-06, + "objective/entropy": -241.4989013671875, + "objective/kl": 21.90488052368164, + "objective/non_score_reward": -1.0952439308166504, + "objective/rlhf_reward": -3.0217259762033652, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 3.7654926776885986, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.689453125, + "step": 935, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.002392292022705 + }, + { + "episode": 14992, + "epoch": 0.08982516686438749, + "loss/policy_avg": 0.9405217170715332, + "lr": 9.401840490797547e-06, + "objective/entropy": -237.89816284179688, + "objective/kl": 25.436769485473633, + "objective/non_score_reward": -1.2718384265899658, + "objective/rlhf_reward": -3.663521905143825, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 6.3816022872924805, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.60546875, + "step": 936, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998328685760498 + }, + { + "episode": 15008, + "epoch": 0.08992103150351703, + "loss/policy_avg": 0.3327906131744385, + "lr": 9.401201431492844e-06, + "objective/entropy": -268.6925354003906, + "objective/kl": 37.998870849609375, + "objective/non_score_reward": -1.899943470954895, + "objective/rlhf_reward": -6.0435144593387395, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 10.16036605834961, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.607421875, + "step": 937, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984662532806396 + }, + { + "episode": 15024, + "epoch": 0.09001689614264659, + "loss/policy_avg": -0.26467132568359375, + "lr": 9.40056237218814e-06, + "objective/entropy": -231.59254455566406, + "objective/kl": 26.266529083251953, + "objective/non_score_reward": -1.3133264780044556, + "objective/rlhf_reward": -3.737534248622593, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 8.63685417175293, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6328125, + "step": 938, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999743938446045 + }, + { + "episode": 15040, + "epoch": 0.09011276078177613, + "loss/policy_avg": 0.2447420209646225, + "lr": 9.399923312883436e-06, + "objective/entropy": -278.01153564453125, + "objective/kl": 27.628671646118164, + "objective/non_score_reward": -1.3814334869384766, + "objective/rlhf_reward": -4.147132196513516, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 2.7261061668395996, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.669921875, + "step": 939, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990365505218506 + }, + { + "episode": 15056, + "epoch": 0.09020862542090569, + "loss/policy_avg": 0.2600797414779663, + "lr": 9.399284253578733e-06, + "objective/entropy": -242.6852264404297, + "objective/kl": 40.91444396972656, + "objective/non_score_reward": -2.045722484588623, + "objective/rlhf_reward": -6.060183467642341, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.501818656921387, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 940, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971075057983398 + }, + { + "episode": 15072, + "epoch": 0.09030449006003523, + "loss/policy_avg": 0.3729836940765381, + "lr": 9.39864519427403e-06, + "objective/entropy": -225.56338500976562, + "objective/kl": 34.106658935546875, + "objective/non_score_reward": -1.7053331136703491, + "objective/rlhf_reward": -5.340379836972117, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 3.6144325733184814, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73828125, + "step": 941, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977765083312988 + }, + { + "episode": 15088, + "epoch": 0.09040035469916478, + "loss/policy_avg": 0.571183443069458, + "lr": 9.398006134969327e-06, + "objective/entropy": -109.51638793945312, + "objective/kl": 57.49871826171875, + "objective/non_score_reward": -2.8749358654022217, + "objective/rlhf_reward": -9.895623478952961, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 5.06275749206543, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.736328125, + "step": 942, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000272512435913 + }, + { + "episode": 15104, + "epoch": 0.09049621933829433, + "loss/policy_avg": 0.7253443002700806, + "lr": 9.397367075664624e-06, + "objective/entropy": -69.86570739746094, + "objective/kl": 40.12030029296875, + "objective/non_score_reward": -2.0060153007507324, + "objective/rlhf_reward": -6.362201397836792, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 66.08172607421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 943, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997114896774292 + }, + { + "episode": 15120, + "epoch": 0.09059208397742388, + "loss/policy_avg": 0.7548943758010864, + "lr": 9.396728016359919e-06, + "objective/entropy": -264.1029357910156, + "objective/kl": 29.125934600830078, + "objective/non_score_reward": -1.456296682357788, + "objective/rlhf_reward": -4.268927424159601, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.555539846420288, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 944, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0007224082946777 + }, + { + "episode": 15136, + "epoch": 0.09068794861655342, + "loss/policy_avg": -0.06224450469017029, + "lr": 9.396088957055216e-06, + "objective/entropy": -215.80255126953125, + "objective/kl": 36.1290283203125, + "objective/non_score_reward": -1.8064515590667725, + "objective/rlhf_reward": -5.7100345728718604, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 2.062628746032715, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.5703125, + "step": 945, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0022828578948975 + }, + { + "episode": 15152, + "epoch": 0.09078381325568298, + "loss/policy_avg": -0.34320878982543945, + "lr": 9.395449897750511e-06, + "objective/entropy": -254.14260864257812, + "objective/kl": 24.163818359375, + "objective/non_score_reward": -1.20819091796875, + "objective/rlhf_reward": -2.4327639102935787, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.011139392852783, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.62109375, + "step": 946, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0023422241210938 + }, + { + "episode": 15168, + "epoch": 0.09087967789481252, + "loss/policy_avg": 0.08071097731590271, + "lr": 9.394810838445808e-06, + "objective/entropy": -269.91180419921875, + "objective/kl": 29.857431411743164, + "objective/non_score_reward": -1.4928715229034424, + "objective/rlhf_reward": -3.8487801573434215, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.305149555206299, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 947, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971578121185303 + }, + { + "episode": 15184, + "epoch": 0.09097554253394208, + "loss/policy_avg": -0.019624732434749603, + "lr": 9.394171779141105e-06, + "objective/entropy": -274.10198974609375, + "objective/kl": 33.219993591308594, + "objective/non_score_reward": -1.6609996557235718, + "objective/rlhf_reward": -6.643998503684998, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.708046913146973, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 948, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000514030456543 + }, + { + "episode": 15200, + "epoch": 0.09107140717307162, + "loss/policy_avg": -0.5435956716537476, + "lr": 9.393532719836402e-06, + "objective/entropy": -245.58270263671875, + "objective/kl": 26.876476287841797, + "objective/non_score_reward": -1.3438239097595215, + "objective/rlhf_reward": -3.771175924603062, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 10.178674697875977, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.630859375, + "step": 949, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0033979415893555 + }, + { + "episode": 15216, + "epoch": 0.09116727181220118, + "loss/policy_avg": 0.6083466410636902, + "lr": 9.392893660531698e-06, + "objective/entropy": -169.32357788085938, + "objective/kl": 38.449127197265625, + "objective/non_score_reward": -1.9224563837051392, + "objective/rlhf_reward": -6.133565931525782, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 8.572129249572754, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.685546875, + "step": 950, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000035285949707 + }, + { + "episode": 15232, + "epoch": 0.09126313645133072, + "loss/policy_avg": 0.1515914499759674, + "lr": 9.392254601226994e-06, + "objective/entropy": -181.75010681152344, + "objective/kl": 31.95659637451172, + "objective/non_score_reward": -1.5978299379348755, + "objective/rlhf_reward": -5.04968385985437, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 12.761173248291016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.798828125, + "step": 951, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977487325668335 + }, + { + "episode": 15248, + "epoch": 0.09135900109046027, + "loss/policy_avg": 0.7638048529624939, + "lr": 9.39161554192229e-06, + "objective/entropy": -158.99050903320312, + "objective/kl": 39.69103240966797, + "objective/non_score_reward": -1.9845517873764038, + "objective/rlhf_reward": -5.815500917212043, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 11.06544303894043, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.541015625, + "step": 952, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9981579780578613 + }, + { + "episode": 15264, + "epoch": 0.09145486572958982, + "loss/policy_avg": 0.764492392539978, + "lr": 9.390976482617587e-06, + "objective/entropy": -159.26947021484375, + "objective/kl": 28.415475845336914, + "objective/non_score_reward": -1.4207737445831299, + "objective/rlhf_reward": -5.683095276355743, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.907594680786133, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6328125, + "step": 953, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997883677482605 + }, + { + "episode": 15280, + "epoch": 0.09155073036871937, + "loss/policy_avg": 0.3368009328842163, + "lr": 9.390337423312884e-06, + "objective/entropy": -173.85415649414062, + "objective/kl": 35.513309478759766, + "objective/non_score_reward": -1.775665521621704, + "objective/rlhf_reward": -5.49854234224947, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 6.337751388549805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.541015625, + "step": 954, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994667768478394 + }, + { + "episode": 15296, + "epoch": 0.09164659500784891, + "loss/policy_avg": 0.0456845797598362, + "lr": 9.389698364008181e-06, + "objective/entropy": 16.725250244140625, + "objective/kl": 36.44686508178711, + "objective/non_score_reward": -1.822343349456787, + "objective/rlhf_reward": -5.865541179378596, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 21.832763671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.548828125, + "step": 955, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999651551246643 + }, + { + "episode": 15312, + "epoch": 0.09174245964697847, + "loss/policy_avg": 0.0268879272043705, + "lr": 9.389059304703478e-06, + "objective/entropy": -219.0832977294922, + "objective/kl": 25.021286010742188, + "objective/non_score_reward": -1.2510643005371094, + "objective/rlhf_reward": -3.1794285133209934, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 13.525361061096191, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.673828125, + "step": 956, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0001580715179443 + }, + { + "episode": 15328, + "epoch": 0.09183832428610801, + "loss/policy_avg": 0.25198429822921753, + "lr": 9.388420245398773e-06, + "objective/entropy": -216.4515838623047, + "objective/kl": 29.98337173461914, + "objective/non_score_reward": -1.4991683959960938, + "objective/rlhf_reward": -3.0729548081171245, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 8.199630737304688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 957, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9965362548828125 + }, + { + "episode": 15344, + "epoch": 0.09193418892523757, + "loss/policy_avg": 0.035516731441020966, + "lr": 9.38778118609407e-06, + "objective/entropy": -250.8704833984375, + "objective/kl": 30.556961059570312, + "objective/non_score_reward": -1.5278480052947998, + "objective/rlhf_reward": -4.73278991231094, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.100607395172119, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.607421875, + "step": 958, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0005221366882324 + }, + { + "episode": 15360, + "epoch": 0.09203005356436711, + "loss/policy_avg": 0.6594608426094055, + "lr": 9.387142126789367e-06, + "objective/entropy": -190.2021942138672, + "objective/kl": 29.693756103515625, + "objective/non_score_reward": -1.4846878051757812, + "objective/rlhf_reward": -4.38249173661764, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 11.999906539916992, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55859375, + "step": 959, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9964005947113037 + }, + { + "episode": 15376, + "epoch": 0.09212591820349667, + "loss/policy_avg": 0.16847842931747437, + "lr": 9.386503067484664e-06, + "objective/entropy": -220.72311401367188, + "objective/kl": 22.618806838989258, + "objective/non_score_reward": -1.1309404373168945, + "objective/rlhf_reward": -3.0731633110955805, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 1.5775080919265747, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.63671875, + "step": 960, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0047149658203125 + }, + { + "episode": 15392, + "epoch": 0.09222178284262621, + "loss/policy_avg": 0.37361010909080505, + "lr": 9.38586400817996e-06, + "objective/entropy": -219.60760498046875, + "objective/kl": 31.668062210083008, + "objective/non_score_reward": -1.58340322971344, + "objective/rlhf_reward": -4.852660181935191, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 6.965027809143066, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.615234375, + "step": 961, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9987406730651855 + }, + { + "episode": 15408, + "epoch": 0.09231764748175576, + "loss/policy_avg": 0.3272181749343872, + "lr": 9.385224948875256e-06, + "objective/entropy": -200.26370239257812, + "objective/kl": 38.33747100830078, + "objective/non_score_reward": -1.916873574256897, + "objective/rlhf_reward": -5.720082710461552, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.9499969482421875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.583984375, + "step": 962, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983041286468506 + }, + { + "episode": 15424, + "epoch": 0.0924135121208853, + "loss/policy_avg": 0.02453005313873291, + "lr": 9.384585889570553e-06, + "objective/entropy": -259.0159606933594, + "objective/kl": 32.376686096191406, + "objective/non_score_reward": -1.6188342571258545, + "objective/rlhf_reward": -5.051504810054866, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 11.491250038146973, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.634765625, + "step": 963, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0016989707946777 + }, + { + "episode": 15440, + "epoch": 0.09250937676001486, + "loss/policy_avg": -0.1082817018032074, + "lr": 9.38394683026585e-06, + "objective/entropy": -136.52200317382812, + "objective/kl": 34.37030792236328, + "objective/non_score_reward": -1.718515396118164, + "objective/rlhf_reward": -5.212202077329742, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 5.610563278198242, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65234375, + "step": 964, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993445873260498 + }, + { + "episode": 15456, + "epoch": 0.0926052413991444, + "loss/policy_avg": 0.3635658025741577, + "lr": 9.383307770961147e-06, + "objective/entropy": -242.04705810546875, + "objective/kl": 26.167871475219727, + "objective/non_score_reward": -1.3083934783935547, + "objective/rlhf_reward": -3.8335740923881527, + "objective/scores": 0.35, + "policy/approxkl_avg": 10.497917175292969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.505859375, + "step": 965, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998986840248108 + }, + { + "episode": 15472, + "epoch": 0.09270110603827396, + "loss/policy_avg": 0.4805383086204529, + "lr": 9.382668711656443e-06, + "objective/entropy": -130.80931091308594, + "objective/kl": 43.840057373046875, + "objective/non_score_reward": -2.192002773284912, + "objective/rlhf_reward": -6.368011450767517, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.2675271034240723, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.619140625, + "step": 966, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001443862915039 + }, + { + "episode": 15488, + "epoch": 0.0927969706774035, + "loss/policy_avg": 0.9434456825256348, + "lr": 9.382029652351739e-06, + "objective/entropy": -116.85310363769531, + "objective/kl": 55.79869842529297, + "objective/non_score_reward": -2.7899351119995117, + "objective/rlhf_reward": -9.426406518618265, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 2.6991868019104004, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.791015625, + "step": 967, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00309157371521 + }, + { + "episode": 15504, + "epoch": 0.09289283531653306, + "loss/policy_avg": 0.2830507755279541, + "lr": 9.381390593047035e-06, + "objective/entropy": -260.5260925292969, + "objective/kl": 34.16276550292969, + "objective/non_score_reward": -1.7081382274627686, + "objective/rlhf_reward": -5.381954531283721, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 4.792706489562988, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.615234375, + "step": 968, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9985530376434326 + }, + { + "episode": 15520, + "epoch": 0.0929886999556626, + "loss/policy_avg": 0.19756931066513062, + "lr": 9.380751533742332e-06, + "objective/entropy": -234.741455078125, + "objective/kl": 25.891204833984375, + "objective/non_score_reward": -1.2945603132247925, + "objective/rlhf_reward": -3.055534782187019, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.262695789337158, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 969, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0007271766662598 + }, + { + "episode": 15536, + "epoch": 0.09308456459479215, + "loss/policy_avg": 0.0513734444975853, + "lr": 9.380112474437628e-06, + "objective/entropy": -195.60171508789062, + "objective/kl": 35.50217819213867, + "objective/non_score_reward": -1.775109052658081, + "objective/rlhf_reward": -5.741186105941219, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.6989755630493164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.607421875, + "step": 970, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 2.0012567043304443 + }, + { + "episode": 15552, + "epoch": 0.0931804292339217, + "loss/policy_avg": 0.1513216644525528, + "lr": 9.379473415132924e-06, + "objective/entropy": -245.57977294921875, + "objective/kl": 23.89773941040039, + "objective/non_score_reward": -1.1948869228363037, + "objective/rlhf_reward": -4.7795480489730835, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.129580020904541, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69921875, + "step": 971, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000563621520996 + }, + { + "episode": 15568, + "epoch": 0.09327629387305125, + "loss/policy_avg": 0.041885554790496826, + "lr": 9.378834355828221e-06, + "objective/entropy": -261.82769775390625, + "objective/kl": 24.18181037902832, + "objective/non_score_reward": -1.2090904712677002, + "objective/rlhf_reward": -3.457759955016476, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 9.62070369720459, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.533203125, + "step": 972, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9969148635864258 + }, + { + "episode": 15584, + "epoch": 0.0933721585121808, + "loss/policy_avg": 0.012015002779662609, + "lr": 9.378195296523518e-06, + "objective/entropy": -251.767333984375, + "objective/kl": 27.563173294067383, + "objective/non_score_reward": -1.378158688545227, + "objective/rlhf_reward": -3.908514711920338, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 1.0967427492141724, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 973, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0006260871887207 + }, + { + "episode": 15600, + "epoch": 0.09346802315131035, + "loss/policy_avg": -0.31819072365760803, + "lr": 9.377556237218815e-06, + "objective/entropy": -175.70556640625, + "objective/kl": 28.285152435302734, + "objective/non_score_reward": -1.4142576456069946, + "objective/rlhf_reward": -4.052910540167408, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.37001371383667, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.505859375, + "step": 974, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995312690734863 + }, + { + "episode": 15616, + "epoch": 0.09356388779043989, + "loss/policy_avg": 0.6060304641723633, + "lr": 9.37691717791411e-06, + "objective/entropy": -34.974281311035156, + "objective/kl": 35.56610107421875, + "objective/non_score_reward": -1.7783050537109375, + "objective/rlhf_reward": -5.59744867065781, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 6.845120906829834, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.853515625, + "step": 975, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995267391204834 + }, + { + "episode": 15632, + "epoch": 0.09365975242956945, + "loss/policy_avg": 0.1691616326570511, + "lr": 9.376278118609407e-06, + "objective/entropy": -173.51535034179688, + "objective/kl": 40.181976318359375, + "objective/non_score_reward": -2.009099006652832, + "objective/rlhf_reward": -6.657793619719845, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 0.46673262119293213, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71875, + "step": 976, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0020508766174316 + }, + { + "episode": 15648, + "epoch": 0.09375561706869899, + "loss/policy_avg": 0.12263473123311996, + "lr": 9.375639059304704e-06, + "objective/entropy": -244.26974487304688, + "objective/kl": 29.573442459106445, + "objective/non_score_reward": -1.4786722660064697, + "objective/rlhf_reward": -4.358429758754328, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 3.748386859893799, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.693359375, + "step": 977, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9990627765655518 + }, + { + "episode": 15664, + "epoch": 0.09385148170782855, + "loss/policy_avg": 1.4557695388793945, + "lr": 9.375000000000001e-06, + "objective/entropy": -133.55853271484375, + "objective/kl": 45.2318229675293, + "objective/non_score_reward": -2.2615909576416016, + "objective/rlhf_reward": -7.530592167171177, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 4.7986626625061035, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 978, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999971389770508 + }, + { + "episode": 15680, + "epoch": 0.09394734634695809, + "loss/policy_avg": 0.04724450409412384, + "lr": 9.374360940695298e-06, + "objective/entropy": -291.25103759765625, + "objective/kl": 28.29153823852539, + "objective/non_score_reward": -1.4145770072937012, + "objective/rlhf_reward": -3.710896800236638, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.313387393951416, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 979, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993375539779663 + }, + { + "episode": 15696, + "epoch": 0.09404321098608764, + "loss/policy_avg": 0.2293320745229721, + "lr": 9.373721881390595e-06, + "objective/entropy": -136.44857788085938, + "objective/kl": 38.36898422241211, + "objective/non_score_reward": -1.9184492826461792, + "objective/rlhf_reward": -5.551090779081855, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.303453207015991, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.61328125, + "step": 980, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999606966972351 + }, + { + "episode": 15712, + "epoch": 0.0941390756252172, + "loss/policy_avg": 0.16989938914775848, + "lr": 9.37308282208589e-06, + "objective/entropy": -171.79864501953125, + "objective/kl": 32.806495666503906, + "objective/non_score_reward": -1.640324592590332, + "objective/rlhf_reward": -4.613887022213872, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.31067180633545, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.603515625, + "step": 981, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984290599822998 + }, + { + "episode": 15728, + "epoch": 0.09423494026434674, + "loss/policy_avg": 0.7234645485877991, + "lr": 9.372443762781187e-06, + "objective/entropy": -219.93374633789062, + "objective/kl": 26.91738510131836, + "objective/non_score_reward": -1.3458693027496338, + "objective/rlhf_reward": -0.9834773302078244, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.4521507024765015, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.646484375, + "step": 982, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.003150463104248 + }, + { + "episode": 15744, + "epoch": 0.0943308049034763, + "loss/policy_avg": 0.48133015632629395, + "lr": 9.371804703476484e-06, + "objective/entropy": -282.47552490234375, + "objective/kl": 39.29179763793945, + "objective/non_score_reward": -1.9645898342132568, + "objective/rlhf_reward": -6.125026241938272, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 6.169063568115234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 983, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997374415397644 + }, + { + "episode": 15760, + "epoch": 0.09442666954260584, + "loss/policy_avg": 0.1187177523970604, + "lr": 9.37116564417178e-06, + "objective/entropy": -158.33642578125, + "objective/kl": 40.20547103881836, + "objective/non_score_reward": -2.0102736949920654, + "objective/rlhf_reward": -6.69945864966455, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 3.5165886878967285, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 984, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981753826141357 + }, + { + "episode": 15776, + "epoch": 0.0945225341817354, + "loss/policy_avg": 0.16677279770374298, + "lr": 9.370526584867077e-06, + "objective/entropy": -162.21728515625, + "objective/kl": 33.61964797973633, + "objective/non_score_reward": -1.6809823513031006, + "objective/rlhf_reward": -5.323929286003112, + "objective/scores": 0.35, + "policy/approxkl_avg": 5.913999557495117, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.734375, + "step": 985, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9968831539154053 + }, + { + "episode": 15792, + "epoch": 0.09461839882086494, + "loss/policy_avg": 0.22338780760765076, + "lr": 9.369887525562373e-06, + "objective/entropy": -191.39588928222656, + "objective/kl": 50.39151382446289, + "objective/non_score_reward": -2.519575595855713, + "objective/rlhf_reward": -8.416443472326385, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 45.444732666015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.755859375, + "step": 986, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998542308807373 + }, + { + "episode": 15808, + "epoch": 0.0947142634599945, + "loss/policy_avg": 0.37791919708251953, + "lr": 9.36924846625767e-06, + "objective/entropy": -270.806396484375, + "objective/kl": 29.205078125, + "objective/non_score_reward": -1.4602539539337158, + "objective/rlhf_reward": -5.841015696525574, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.895004272460938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 987, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979805946350098 + }, + { + "episode": 15824, + "epoch": 0.09481012809912404, + "loss/policy_avg": 0.7314577102661133, + "lr": 9.368609406952966e-06, + "objective/entropy": -174.33633422851562, + "objective/kl": 41.00555419921875, + "objective/non_score_reward": -2.0502774715423584, + "objective/rlhf_reward": -6.77727790613946, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 4.151052474975586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 988, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998262882232666 + }, + { + "episode": 15840, + "epoch": 0.09490599273825359, + "loss/policy_avg": 0.1200692355632782, + "lr": 9.367970347648263e-06, + "objective/entropy": -259.9232177734375, + "objective/kl": 32.56160354614258, + "objective/non_score_reward": -1.628080129623413, + "objective/rlhf_reward": -5.112320518493652, + "objective/scores": 0.35, + "policy/approxkl_avg": 3.3896703720092773, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.615234375, + "step": 989, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0001492500305176 + }, + { + "episode": 15856, + "epoch": 0.09500185737738313, + "loss/policy_avg": 0.7871278524398804, + "lr": 9.367331288343558e-06, + "objective/entropy": -162.90664672851562, + "objective/kl": 37.55353927612305, + "objective/non_score_reward": -1.8776767253875732, + "objective/rlhf_reward": -6.086875279148188, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 24.93891716003418, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7890625, + "step": 990, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9959328174591064 + }, + { + "episode": 15872, + "epoch": 0.09509772201651269, + "loss/policy_avg": -0.12516134977340698, + "lr": 9.366692229038855e-06, + "objective/entropy": -238.83116149902344, + "objective/kl": 37.03616714477539, + "objective/non_score_reward": -1.8518084287643433, + "objective/rlhf_reward": -6.047983967994137, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 15.576482772827148, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.681640625, + "step": 991, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985809326171875 + }, + { + "episode": 15888, + "epoch": 0.09519358665564223, + "loss/policy_avg": -0.04968651384115219, + "lr": 9.366053169734152e-06, + "objective/entropy": -183.43231201171875, + "objective/kl": 35.40851593017578, + "objective/non_score_reward": -1.77042555809021, + "objective/rlhf_reward": -5.756189737349672, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 0.5774535536766052, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.583984375, + "step": 992, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002680778503418 + }, + { + "episode": 15904, + "epoch": 0.09528945129477179, + "loss/policy_avg": 0.009859908372163773, + "lr": 9.365414110429449e-06, + "objective/entropy": -14.670166015625, + "objective/kl": 53.70581817626953, + "objective/non_score_reward": -2.685290813446045, + "objective/rlhf_reward": -8.3411630153656, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.3184102773666382, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 993, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005533695220947 + }, + { + "episode": 15920, + "epoch": 0.09538531593390133, + "loss/policy_avg": 0.3695295453071594, + "lr": 9.364775051124744e-06, + "objective/entropy": -288.468505859375, + "objective/kl": 32.96984100341797, + "objective/non_score_reward": -1.6484923362731934, + "objective/rlhf_reward": -5.0781975624882545, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 3.1653892993927, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.642578125, + "step": 994, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999734878540039 + }, + { + "episode": 15936, + "epoch": 0.09548118057303089, + "loss/policy_avg": 0.3992432951927185, + "lr": 9.364135991820041e-06, + "objective/entropy": -231.646728515625, + "objective/kl": 34.67195510864258, + "objective/non_score_reward": -1.733597755432129, + "objective/rlhf_reward": -5.510559280117121, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 19.767539978027344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.849609375, + "step": 995, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9987037181854248 + }, + { + "episode": 15952, + "epoch": 0.09557704521216043, + "loss/policy_avg": 0.03356311097741127, + "lr": 9.363496932515338e-06, + "objective/entropy": -210.72410583496094, + "objective/kl": 27.1010799407959, + "objective/non_score_reward": -1.3550540208816528, + "objective/rlhf_reward": -3.595387215885233, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 1.0958271026611328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7265625, + "step": 996, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9994826316833496 + }, + { + "episode": 15968, + "epoch": 0.09567290985128998, + "loss/policy_avg": 1.1218140125274658, + "lr": 9.362857873210635e-06, + "objective/entropy": -71.63316345214844, + "objective/kl": 40.19666290283203, + "objective/non_score_reward": -2.009833335876465, + "objective/rlhf_reward": -8.03933310508728, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4838500022888184, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.66015625, + "step": 997, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991953372955322 + }, + { + "episode": 15984, + "epoch": 0.09576877449041953, + "loss/policy_avg": 0.23440885543823242, + "lr": 9.362218813905932e-06, + "objective/entropy": -217.69229125976562, + "objective/kl": 26.445728302001953, + "objective/non_score_reward": -1.3222863674163818, + "objective/rlhf_reward": -3.773374044688877, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 11.445338249206543, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.623046875, + "step": 998, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9970027208328247 + }, + { + "episode": 16000, + "epoch": 0.09586463912954908, + "loss/policy_avg": -0.3169388175010681, + "lr": 9.361579754601227e-06, + "objective/entropy": -116.28077697753906, + "objective/kl": 44.722564697265625, + "objective/non_score_reward": -2.236128091812134, + "objective/rlhf_reward": -6.997101019101079, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.412589073181152, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.734375, + "step": 999, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000357151031494 + }, + { + "episode": 16016, + "epoch": 0.09596050376867862, + "loss/policy_avg": 0.49583154916763306, + "lr": 9.360940695296524e-06, + "objective/entropy": -255.0631561279297, + "objective/kl": 37.207157135009766, + "objective/non_score_reward": -1.8603577613830566, + "objective/rlhf_reward": -4.517711792827818, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 4.7410383224487305, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 1000, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983508586883545 + }, + { + "episode": 16032, + "epoch": 0.09605636840780818, + "loss/policy_avg": 0.2908029556274414, + "lr": 9.36030163599182e-06, + "objective/entropy": -158.05224609375, + "objective/kl": 43.559486389160156, + "objective/non_score_reward": -2.177974224090576, + "objective/rlhf_reward": -7.386384520560426, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 4.997418403625488, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49609375, + "step": 1001, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000509738922119 + }, + { + "episode": 16048, + "epoch": 0.09615223304693772, + "loss/policy_avg": 0.0880887508392334, + "lr": 9.359662576687117e-06, + "objective/entropy": -159.17636108398438, + "objective/kl": 32.491432189941406, + "objective/non_score_reward": -1.6245718002319336, + "objective/rlhf_reward": -5.1196852708734095, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 23.146318435668945, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.615234375, + "step": 1002, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992117881774902 + }, + { + "episode": 16064, + "epoch": 0.09624809768606728, + "loss/policy_avg": -0.1608562171459198, + "lr": 9.359023517382414e-06, + "objective/entropy": 31.09607696533203, + "objective/kl": 48.06477355957031, + "objective/non_score_reward": -2.4032387733459473, + "objective/rlhf_reward": -7.78812610653312, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 4.198085784912109, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.71484375, + "step": 1003, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001403331756592 + }, + { + "episode": 16080, + "epoch": 0.09634396232519682, + "loss/policy_avg": -0.09791003167629242, + "lr": 9.358384458077711e-06, + "objective/entropy": -204.42648315429688, + "objective/kl": 32.63614273071289, + "objective/non_score_reward": -1.6318070888519287, + "objective/rlhf_reward": -5.127228474617004, + "objective/scores": 0.35, + "policy/approxkl_avg": 3.644939422607422, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.623046875, + "step": 1004, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0014867782592773 + }, + { + "episode": 16096, + "epoch": 0.09643982696432638, + "loss/policy_avg": 0.3904947340488434, + "lr": 9.357745398773006e-06, + "objective/entropy": -230.99227905273438, + "objective/kl": 26.775943756103516, + "objective/non_score_reward": -1.3387972116470337, + "objective/rlhf_reward": -3.955188965797424, + "objective/scores": 0.35, + "policy/approxkl_avg": 6.282003402709961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.859375, + "step": 1005, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 12, + "val/ratio": 1.999232292175293 + }, + { + "episode": 16112, + "epoch": 0.09653569160345592, + "loss/policy_avg": 0.7725321054458618, + "lr": 9.357106339468303e-06, + "objective/entropy": -164.7260284423828, + "objective/kl": 36.20423889160156, + "objective/non_score_reward": -1.8102120161056519, + "objective/rlhf_reward": -5.416019315990518, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.2319459915161133, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69921875, + "step": 1006, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000358819961548 + }, + { + "episode": 16128, + "epoch": 0.09663155624258547, + "loss/policy_avg": 0.4622969627380371, + "lr": 9.3564672801636e-06, + "objective/entropy": -133.11448669433594, + "objective/kl": 46.60032272338867, + "objective/non_score_reward": -2.3300158977508545, + "objective/rlhf_reward": -7.920063829421997, + "objective/scores": 0.35, + "policy/approxkl_avg": 4.947162628173828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.63671875, + "step": 1007, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9968760013580322 + }, + { + "episode": 16144, + "epoch": 0.09672742088171501, + "loss/policy_avg": 0.28032606840133667, + "lr": 9.355828220858897e-06, + "objective/entropy": -185.09371948242188, + "objective/kl": 38.272674560546875, + "objective/non_score_reward": -1.9136335849761963, + "objective/rlhf_reward": -6.275932290641171, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 5.263652801513672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.650390625, + "step": 1008, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0010600090026855 + }, + { + "episode": 16160, + "epoch": 0.09682328552084457, + "loss/policy_avg": 0.18294349312782288, + "lr": 9.355189161554194e-06, + "objective/entropy": -147.19964599609375, + "objective/kl": 32.98589324951172, + "objective/non_score_reward": -1.6492946147918701, + "objective/rlhf_reward": -4.935319071233856, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 5.73829460144043, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.67578125, + "step": 1009, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979850053787231 + }, + { + "episode": 16176, + "epoch": 0.09691915015997411, + "loss/policy_avg": -0.004333788529038429, + "lr": 9.35455010224949e-06, + "objective/entropy": -197.96774291992188, + "objective/kl": 37.333194732666016, + "objective/non_score_reward": -1.8666596412658691, + "objective/rlhf_reward": -4.5429196699869365, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.3020401000976562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8203125, + "step": 1010, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9998223781585693 + }, + { + "episode": 16192, + "epoch": 0.09701501479910367, + "loss/policy_avg": -0.052329957485198975, + "lr": 9.353911042944786e-06, + "objective/entropy": -197.37957763671875, + "objective/kl": 30.12477684020996, + "objective/non_score_reward": -1.5062386989593506, + "objective/rlhf_reward": -4.077543805317815, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.2824825048446655, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.619140625, + "step": 1011, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000406265258789 + }, + { + "episode": 16208, + "epoch": 0.09711087943823321, + "loss/policy_avg": -0.058374106884002686, + "lr": 9.353271983640083e-06, + "objective/entropy": -196.46224975585938, + "objective/kl": 28.03622817993164, + "objective/non_score_reward": -1.4018113613128662, + "objective/rlhf_reward": -4.126293065960764, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 1.1209321022033691, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.57421875, + "step": 1012, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.001298427581787 + }, + { + "episode": 16224, + "epoch": 0.09720674407736277, + "loss/policy_avg": 0.7006990909576416, + "lr": 9.352632924335378e-06, + "objective/entropy": -285.3323974609375, + "objective/kl": 28.77189826965332, + "objective/non_score_reward": -1.4385948181152344, + "objective/rlhf_reward": -4.4288665390311905, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 2.1271591186523438, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.693359375, + "step": 1013, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002638339996338 + }, + { + "episode": 16240, + "epoch": 0.09730260871649231, + "loss/policy_avg": 0.07051658630371094, + "lr": 9.351993865030675e-06, + "objective/entropy": -198.2432098388672, + "objective/kl": 24.557363510131836, + "objective/non_score_reward": -1.2278680801391602, + "objective/rlhf_reward": -3.5859598255454728, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 29.07752227783203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1014, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990439414978027 + }, + { + "episode": 16256, + "epoch": 0.09739847335562186, + "loss/policy_avg": -0.5166081190109253, + "lr": 9.351354805725972e-06, + "objective/entropy": -63.29674530029297, + "objective/kl": 38.85722351074219, + "objective/non_score_reward": -1.9428613185882568, + "objective/rlhf_reward": -6.167325112883168, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 6.718572616577148, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.61328125, + "step": 1015, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0040817260742188 + }, + { + "episode": 16272, + "epoch": 0.0974943379947514, + "loss/policy_avg": 0.462972491979599, + "lr": 9.350715746421269e-06, + "objective/entropy": -214.515380859375, + "objective/kl": 33.796573638916016, + "objective/non_score_reward": -1.689828634262085, + "objective/rlhf_reward": -5.203055350986078, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 9.360330581665039, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.79296875, + "step": 1016, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.996612310409546 + }, + { + "episode": 16288, + "epoch": 0.09759020263388096, + "loss/policy_avg": -0.1453489363193512, + "lr": 9.350076687116566e-06, + "objective/entropy": -235.11651611328125, + "objective/kl": 33.26921081542969, + "objective/non_score_reward": -1.663460612297058, + "objective/rlhf_reward": -4.7064312202500656, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.160917282104492, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.703125, + "step": 1017, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000718593597412 + }, + { + "episode": 16304, + "epoch": 0.0976860672730105, + "loss/policy_avg": 0.19937211275100708, + "lr": 9.34943762781186e-06, + "objective/entropy": -255.98963928222656, + "objective/kl": 37.99565887451172, + "objective/non_score_reward": -1.8997828960418701, + "objective/rlhf_reward": -6.257495990305571, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 18.184246063232422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 1018, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998490810394287 + }, + { + "episode": 16320, + "epoch": 0.09778193191214006, + "loss/policy_avg": -0.04537857323884964, + "lr": 9.348798568507158e-06, + "objective/entropy": -208.28750610351562, + "objective/kl": 29.751262664794922, + "objective/non_score_reward": -1.4875633716583252, + "objective/rlhf_reward": -4.002841900067265, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 35.739540100097656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.787109375, + "step": 1019, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980144500732422 + }, + { + "episode": 16336, + "epoch": 0.0978777965512696, + "loss/policy_avg": 0.15292394161224365, + "lr": 9.348159509202455e-06, + "objective/entropy": -234.64700317382812, + "objective/kl": 29.85890769958496, + "objective/non_score_reward": -1.4929454326629639, + "objective/rlhf_reward": -4.3676616287866405, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.772150993347168, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.677734375, + "step": 1020, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000951290130615 + }, + { + "episode": 16352, + "epoch": 0.09797366119039916, + "loss/policy_avg": 0.3814322352409363, + "lr": 9.347520449897751e-06, + "objective/entropy": -124.42337799072266, + "objective/kl": 36.442901611328125, + "objective/non_score_reward": -1.8221449851989746, + "objective/rlhf_reward": -5.8885798215866085, + "objective/scores": 0.35, + "policy/approxkl_avg": 5.533565998077393, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 1021, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9961100816726685 + }, + { + "episode": 16368, + "epoch": 0.0980695258295287, + "loss/policy_avg": 0.4999345541000366, + "lr": 9.346881390593048e-06, + "objective/entropy": -192.25704956054688, + "objective/kl": 24.090442657470703, + "objective/non_score_reward": -1.2045221328735352, + "objective/rlhf_reward": -3.3023168084942665, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 16.40319061279297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.556640625, + "step": 1022, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997682809829712 + }, + { + "episode": 16384, + "epoch": 0.09816539046865826, + "loss/policy_avg": 0.22556136548519135, + "lr": 9.346242331288345e-06, + "objective/entropy": -280.6515197753906, + "objective/kl": 30.555099487304688, + "objective/non_score_reward": -1.5277550220489502, + "objective/rlhf_reward": -4.506900105539875, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 0.8321056365966797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.650390625, + "step": 1023, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9995068311691284 + }, + { + "episode": 16400, + "epoch": 0.0982612551077878, + "loss/policy_avg": 0.1927730292081833, + "lr": 9.34560327198364e-06, + "objective/entropy": -114.62777709960938, + "objective/kl": 41.009063720703125, + "objective/non_score_reward": -2.0504534244537354, + "objective/rlhf_reward": -6.5399538926488034, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 12.904714584350586, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.55859375, + "step": 1024, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970085620880127 + }, + { + "episode": 16416, + "epoch": 0.09835711974691735, + "loss/policy_avg": 0.004962563514709473, + "lr": 9.344964212678937e-06, + "objective/entropy": -175.405029296875, + "objective/kl": 32.8451042175293, + "objective/non_score_reward": -1.6422550678253174, + "objective/rlhf_reward": -4.835687295595805, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 2.176795244216919, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.763671875, + "step": 1025, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9991182088851929 + }, + { + "episode": 16432, + "epoch": 0.0984529843860469, + "loss/policy_avg": 0.3356385827064514, + "lr": 9.344325153374234e-06, + "objective/entropy": -179.56375122070312, + "objective/kl": 44.559669494628906, + "objective/non_score_reward": -2.2279834747314453, + "objective/rlhf_reward": -7.1786006848017365, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.5793884992599487, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 1026, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000429153442383 + }, + { + "episode": 16448, + "epoch": 0.09854884902517645, + "loss/policy_avg": 0.062264252454042435, + "lr": 9.343686094069531e-06, + "objective/entropy": -124.67230224609375, + "objective/kl": 32.24571228027344, + "objective/non_score_reward": -1.6122857332229614, + "objective/rlhf_reward": -4.049142932891845, + "objective/scores": 0.6, + "policy/approxkl_avg": 4.209178924560547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68359375, + "step": 1027, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998481273651123 + }, + { + "episode": 16464, + "epoch": 0.098644713664306, + "loss/policy_avg": 0.27750128507614136, + "lr": 9.343047034764828e-06, + "objective/entropy": -280.3656005859375, + "objective/kl": 36.0235710144043, + "objective/non_score_reward": -1.8011784553527832, + "objective/rlhf_reward": -5.863078525572448, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 9.040508270263672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 1028, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9958012104034424 + }, + { + "episode": 16480, + "epoch": 0.09874057830343555, + "loss/policy_avg": -0.08439403772354126, + "lr": 9.342407975460123e-06, + "objective/entropy": -159.83497619628906, + "objective/kl": 42.88642120361328, + "objective/non_score_reward": -2.1443209648132324, + "objective/rlhf_reward": -7.1266858383134455, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 6.443965911865234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.701171875, + "step": 1029, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.003908157348633 + }, + { + "episode": 16496, + "epoch": 0.09883644294256509, + "loss/policy_avg": 0.6222244501113892, + "lr": 9.34176891615542e-06, + "objective/entropy": -148.41481018066406, + "objective/kl": 38.87040710449219, + "objective/non_score_reward": -1.943520188331604, + "objective/rlhf_reward": -6.258308732303318, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 28.20026397705078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.87109375, + "step": 1030, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9953274726867676 + }, + { + "episode": 16512, + "epoch": 0.09893230758169465, + "loss/policy_avg": 0.04845335707068443, + "lr": 9.341129856850717e-06, + "objective/entropy": -236.35935974121094, + "objective/kl": 28.790306091308594, + "objective/non_score_reward": -1.4395153522491455, + "objective/rlhf_reward": -3.8106498820351913, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 6.143889427185059, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.642578125, + "step": 1031, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9981472492218018 + }, + { + "episode": 16528, + "epoch": 0.09902817222082419, + "loss/policy_avg": 0.1800106167793274, + "lr": 9.340490797546014e-06, + "objective/entropy": -234.52456665039062, + "objective/kl": 38.6103515625, + "objective/non_score_reward": -1.9305176734924316, + "objective/rlhf_reward": -7.722070813179016, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.025315761566162, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 1032, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001583099365234 + }, + { + "episode": 16544, + "epoch": 0.09912403685995375, + "loss/policy_avg": 0.1573864221572876, + "lr": 9.33985173824131e-06, + "objective/entropy": -206.30435180664062, + "objective/kl": 29.538883209228516, + "objective/non_score_reward": -1.4769442081451416, + "objective/rlhf_reward": -3.507777070999145, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.956908702850342, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7734375, + "step": 1033, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9989354610443115 + }, + { + "episode": 16560, + "epoch": 0.09921990149908329, + "loss/policy_avg": 0.3316153883934021, + "lr": 9.339212678936606e-06, + "objective/entropy": -158.2957763671875, + "objective/kl": 27.869169235229492, + "objective/non_score_reward": -1.393458366394043, + "objective/rlhf_reward": -5.5738338232040405, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.423194169998169, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.60546875, + "step": 1034, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998276710510254 + }, + { + "episode": 16576, + "epoch": 0.09931576613821284, + "loss/policy_avg": 0.540399432182312, + "lr": 9.338573619631903e-06, + "objective/entropy": -278.6914367675781, + "objective/kl": 24.516807556152344, + "objective/non_score_reward": -1.2258403301239014, + "objective/rlhf_reward": -3.4795294596749224, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 3.2752022743225098, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 1035, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993618726730347 + }, + { + "episode": 16592, + "epoch": 0.09941163077734239, + "loss/policy_avg": 0.17466121912002563, + "lr": 9.3379345603272e-06, + "objective/entropy": -273.7776794433594, + "objective/kl": 35.438560485839844, + "objective/non_score_reward": -1.7719281911849976, + "objective/rlhf_reward": -5.571941101344761, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 19.215896606445312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.650390625, + "step": 1036, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990078210830688 + }, + { + "episode": 16608, + "epoch": 0.09950749541647194, + "loss/policy_avg": 0.6281372308731079, + "lr": 9.337295501022495e-06, + "objective/entropy": -54.27313232421875, + "objective/kl": 45.946815490722656, + "objective/non_score_reward": -2.2973408699035645, + "objective/rlhf_reward": -7.6735920546376075, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 10.886024475097656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.931640625, + "step": 1037, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997375011444092 + }, + { + "episode": 16624, + "epoch": 0.0996033600556015, + "loss/policy_avg": 0.5044693350791931, + "lr": 9.336656441717792e-06, + "objective/entropy": -51.8316650390625, + "objective/kl": 34.80516815185547, + "objective/non_score_reward": -1.7402584552764893, + "objective/rlhf_reward": -5.5104356213525385, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 3.0943219661712646, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.837890625, + "step": 1038, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0017142295837402 + }, + { + "episode": 16640, + "epoch": 0.09969922469473104, + "loss/policy_avg": 0.050643354654312134, + "lr": 9.336017382413088e-06, + "objective/entropy": -289.61761474609375, + "objective/kl": 35.579490661621094, + "objective/non_score_reward": -1.7789745330810547, + "objective/rlhf_reward": -5.559638767448023, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 29.854312896728516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.71484375, + "step": 1039, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99898362159729 + }, + { + "episode": 16656, + "epoch": 0.0997950893338606, + "loss/policy_avg": 0.66060471534729, + "lr": 9.335378323108385e-06, + "objective/entropy": -253.1927490234375, + "objective/kl": 31.551429748535156, + "objective/non_score_reward": -1.5775716304779053, + "objective/rlhf_reward": -4.9847734308540055, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 74.64668273925781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 1040, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998822569847107 + }, + { + "episode": 16672, + "epoch": 0.09989095397299014, + "loss/policy_avg": 0.9751706123352051, + "lr": 9.334739263803682e-06, + "objective/entropy": -148.04188537597656, + "objective/kl": 32.937591552734375, + "objective/non_score_reward": -1.6468796730041504, + "objective/rlhf_reward": -6.587518572807312, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.001709461212158, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.728515625, + "step": 1041, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000730037689209 + }, + { + "episode": 16688, + "epoch": 0.0999868186121197, + "loss/policy_avg": 0.37717461585998535, + "lr": 9.334100204498977e-06, + "objective/entropy": -37.40810012817383, + "objective/kl": 31.557598114013672, + "objective/non_score_reward": -1.5778799057006836, + "objective/rlhf_reward": -4.364108632283147, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 75.23666381835938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.60546875, + "step": 1042, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990663528442383 + }, + { + "episode": 16704, + "epoch": 0.10008268325124924, + "loss/policy_avg": 0.21707114577293396, + "lr": 9.333461145194274e-06, + "objective/entropy": -185.875732421875, + "objective/kl": 31.279882431030273, + "objective/non_score_reward": -1.563994288444519, + "objective/rlhf_reward": -4.133270683065925, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 13.107833862304688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.703125, + "step": 1043, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998514175415039 + }, + { + "episode": 16720, + "epoch": 0.10017854789037879, + "loss/policy_avg": 0.19673524796962738, + "lr": 9.332822085889571e-06, + "objective/entropy": -271.62109375, + "objective/kl": 31.95672607421875, + "objective/non_score_reward": -1.5978362560272217, + "objective/rlhf_reward": -1.9913449048995968, + "objective/scores": 1.1, + "policy/approxkl_avg": 8.022303581237793, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.669921875, + "step": 1044, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9962241649627686 + }, + { + "episode": 16736, + "epoch": 0.10027441252950833, + "loss/policy_avg": 0.36011672019958496, + "lr": 9.332183026584868e-06, + "objective/entropy": -189.5650634765625, + "objective/kl": 27.331592559814453, + "objective/non_score_reward": -1.3665797710418701, + "objective/rlhf_reward": -3.34361249424604, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 12.111129760742188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7890625, + "step": 1045, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9945616722106934 + }, + { + "episode": 16752, + "epoch": 0.10037027716863789, + "loss/policy_avg": 0.24991941452026367, + "lr": 9.331543967280165e-06, + "objective/entropy": -269.1661682128906, + "objective/kl": 29.150144577026367, + "objective/non_score_reward": -1.4575071334838867, + "objective/rlhf_reward": -4.314256751331028, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 39.73731231689453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.591796875, + "step": 1046, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9968986511230469 + }, + { + "episode": 16768, + "epoch": 0.10046614180776743, + "loss/policy_avg": 0.018538065254688263, + "lr": 9.330904907975462e-06, + "objective/entropy": -128.5980224609375, + "objective/kl": 42.25013732910156, + "objective/non_score_reward": -2.112506866455078, + "objective/rlhf_reward": -6.050027823448181, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.4199237823486328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 1047, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000657320022583 + }, + { + "episode": 16784, + "epoch": 0.10056200644689699, + "loss/policy_avg": 0.35199424624443054, + "lr": 9.330265848670757e-06, + "objective/entropy": -282.9249572753906, + "objective/kl": 34.62944793701172, + "objective/non_score_reward": -1.7314722537994385, + "objective/rlhf_reward": -4.002170358539793, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.832670211791992, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60546875, + "step": 1048, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9977006912231445 + }, + { + "episode": 16800, + "epoch": 0.10065787108602653, + "loss/policy_avg": -0.12381379306316376, + "lr": 9.329626789366054e-06, + "objective/entropy": -177.63133239746094, + "objective/kl": 29.458477020263672, + "objective/non_score_reward": -1.472923755645752, + "objective/rlhf_reward": -4.229835753858673, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.280195713043213, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.638671875, + "step": 1049, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002951145172119 + }, + { + "episode": 16816, + "epoch": 0.10075373572515609, + "loss/policy_avg": 0.06033053621649742, + "lr": 9.32898773006135e-06, + "objective/entropy": -229.76272583007812, + "objective/kl": 25.89266586303711, + "objective/non_score_reward": -1.294633388519287, + "objective/rlhf_reward": -3.7547014548378863, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 3.814189910888672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6640625, + "step": 1050, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9981303215026855 + }, + { + "episode": 16832, + "epoch": 0.10084960036428563, + "loss/policy_avg": -0.14406134188175201, + "lr": 9.328348670756648e-06, + "objective/entropy": -121.60057067871094, + "objective/kl": 34.72946548461914, + "objective/non_score_reward": -1.7364733219146729, + "objective/rlhf_reward": -5.522061069210139, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 4.643096446990967, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 1051, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0071685314178467 + }, + { + "episode": 16848, + "epoch": 0.10094546500341518, + "loss/policy_avg": 0.3516131639480591, + "lr": 9.327709611451944e-06, + "objective/entropy": -290.5709228515625, + "objective/kl": 32.417964935302734, + "objective/non_score_reward": -1.6208982467651367, + "objective/rlhf_reward": -2.0835929870605465, + "objective/scores": 1.1, + "policy/approxkl_avg": 106.68559265136719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.578125, + "step": 1052, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990932941436768 + }, + { + "episode": 16864, + "epoch": 0.10104132964254472, + "loss/policy_avg": -0.2397887408733368, + "lr": 9.32707055214724e-06, + "objective/entropy": -130.25076293945312, + "objective/kl": 37.00995635986328, + "objective/non_score_reward": -1.850497841835022, + "objective/rlhf_reward": -5.978159268100825, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 5.758305072784424, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6875, + "step": 1053, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002975463867188 + }, + { + "episode": 16880, + "epoch": 0.10113719428167428, + "loss/policy_avg": 0.07710824906826019, + "lr": 9.326431492842537e-06, + "objective/entropy": -265.08575439453125, + "objective/kl": 30.579792022705078, + "objective/non_score_reward": -1.528989553451538, + "objective/rlhf_reward": -3.192239318729612, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.1249363422393799, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 1054, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.99985933303833 + }, + { + "episode": 16896, + "epoch": 0.10123305892080382, + "loss/policy_avg": 0.5552304983139038, + "lr": 9.325792433537833e-06, + "objective/entropy": -214.11900329589844, + "objective/kl": 49.237579345703125, + "objective/non_score_reward": -2.461879014968872, + "objective/rlhf_reward": -8.447516059875488, + "objective/scores": 0.35, + "policy/approxkl_avg": 28.872817993164062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.66796875, + "step": 1055, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9940861463546753 + }, + { + "episode": 16912, + "epoch": 0.10132892355993338, + "loss/policy_avg": 0.4369004964828491, + "lr": 9.325153374233129e-06, + "objective/entropy": -218.92349243164062, + "objective/kl": 31.91252899169922, + "objective/non_score_reward": -1.5956264734268188, + "objective/rlhf_reward": -4.435094545559819, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 20.476360321044922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 1056, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99814772605896 + }, + { + "episode": 16928, + "epoch": 0.10142478819906292, + "loss/policy_avg": 0.11664807796478271, + "lr": 9.324514314928425e-06, + "objective/entropy": -241.1952667236328, + "objective/kl": 33.52198791503906, + "objective/non_score_reward": -1.6760993003845215, + "objective/rlhf_reward": -4.971064166227976, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.343099594116211, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.822265625, + "step": 1057, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999535083770752 + }, + { + "episode": 16944, + "epoch": 0.10152065283819248, + "loss/policy_avg": -0.033681720495224, + "lr": 9.323875255623722e-06, + "objective/entropy": -244.3253173828125, + "objective/kl": 26.85427474975586, + "objective/non_score_reward": -1.3427138328552246, + "objective/rlhf_reward": -3.7667349911371044, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 5.019390106201172, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.677734375, + "step": 1058, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000871419906616 + }, + { + "episode": 16960, + "epoch": 0.10161651747732202, + "loss/policy_avg": -0.006691465154290199, + "lr": 9.32323619631902e-06, + "objective/entropy": -193.07406616210938, + "objective/kl": 22.30344009399414, + "objective/non_score_reward": -1.115172028541565, + "objective/rlhf_reward": -2.9044288088947083, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 14.373213768005371, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.734375, + "step": 1059, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997581243515015 + }, + { + "episode": 16976, + "epoch": 0.10171238211645157, + "loss/policy_avg": 0.03293745219707489, + "lr": 9.322597137014316e-06, + "objective/entropy": -276.60870361328125, + "objective/kl": 35.162376403808594, + "objective/non_score_reward": -1.7581188678741455, + "objective/rlhf_reward": -5.632475113868713, + "objective/scores": 0.35, + "policy/approxkl_avg": 6.92661714553833, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 1060, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994102716445923 + }, + { + "episode": 16992, + "epoch": 0.10180824675558112, + "loss/policy_avg": 0.009452302008867264, + "lr": 9.321958077709611e-06, + "objective/entropy": -167.18348693847656, + "objective/kl": 33.525054931640625, + "objective/non_score_reward": -1.676252841949463, + "objective/rlhf_reward": -5.043151860654937, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 0.7187179923057556, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.98828125, + "step": 1061, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.002134323120117 + }, + { + "episode": 17008, + "epoch": 0.10190411139471067, + "loss/policy_avg": 0.2391328066587448, + "lr": 9.321319018404908e-06, + "objective/entropy": -251.56936645507812, + "objective/kl": 31.454349517822266, + "objective/non_score_reward": -1.5727174282073975, + "objective/rlhf_reward": -4.466040725978922, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.082510471343994, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.634765625, + "step": 1062, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0036981105804443 + }, + { + "episode": 17024, + "epoch": 0.10199997603384021, + "loss/policy_avg": 0.2995299696922302, + "lr": 9.320679959100205e-06, + "objective/entropy": -240.9496307373047, + "objective/kl": 36.60504913330078, + "objective/non_score_reward": -1.8302524089813232, + "objective/rlhf_reward": -5.716889891687947, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 1.9207489490509033, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.60546875, + "step": 1063, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988347291946411 + }, + { + "episode": 17040, + "epoch": 0.10209584067296977, + "loss/policy_avg": 0.14015616476535797, + "lr": 9.320040899795502e-06, + "objective/entropy": -262.3077392578125, + "objective/kl": 22.77030372619629, + "objective/non_score_reward": -1.1385152339935303, + "objective/rlhf_reward": -3.2124252229029233, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 1.2123262882232666, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.734375, + "step": 1064, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995969533920288 + }, + { + "episode": 17056, + "epoch": 0.10219170531209931, + "loss/policy_avg": 0.14029760658740997, + "lr": 9.319401840490799e-06, + "objective/entropy": -303.0190734863281, + "objective/kl": 25.82904815673828, + "objective/non_score_reward": -1.2914522886276245, + "objective/rlhf_reward": -3.609549908843592, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.374150276184082, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 1065, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9978749752044678 + }, + { + "episode": 17072, + "epoch": 0.10228756995122887, + "loss/policy_avg": 0.3477242588996887, + "lr": 9.318762781186094e-06, + "objective/entropy": -70.10704040527344, + "objective/kl": 36.12684631347656, + "objective/non_score_reward": -1.806342363357544, + "objective/rlhf_reward": -5.883733919172911, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.796685695648193, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.560546875, + "step": 1066, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998852014541626 + }, + { + "episode": 17088, + "epoch": 0.10238343459035841, + "loss/policy_avg": 0.07034695893526077, + "lr": 9.318123721881391e-06, + "objective/entropy": -297.8764343261719, + "objective/kl": 27.875173568725586, + "objective/non_score_reward": -1.3937586545944214, + "objective/rlhf_reward": -5.575034737586975, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.2109901905059814, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 1067, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.002016305923462 + }, + { + "episode": 17104, + "epoch": 0.10247929922948797, + "loss/policy_avg": 1.4407649040222168, + "lr": 9.317484662576688e-06, + "objective/entropy": -241.74539184570312, + "objective/kl": 19.868005752563477, + "objective/non_score_reward": -0.9934003353118896, + "objective/rlhf_reward": -2.369481120173054, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 1.7092839479446411, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.673828125, + "step": 1068, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.004781723022461 + }, + { + "episode": 17120, + "epoch": 0.10257516386861751, + "loss/policy_avg": 0.2252398431301117, + "lr": 9.316845603271985e-06, + "objective/entropy": -238.30023193359375, + "objective/kl": 36.790252685546875, + "objective/non_score_reward": -1.839512586593628, + "objective/rlhf_reward": -5.410638998227055, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.8241536617279053, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71875, + "step": 1069, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990484714508057 + }, + { + "episode": 17136, + "epoch": 0.10267102850774706, + "loss/policy_avg": 0.2009587585926056, + "lr": 9.316206543967282e-06, + "objective/entropy": -281.51422119140625, + "objective/kl": 31.799592971801758, + "objective/non_score_reward": -1.589979648590088, + "objective/rlhf_reward": -3.9599182963371273, + "objective/scores": 0.6, + "policy/approxkl_avg": 11.409127235412598, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73046875, + "step": 1070, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992220401763916 + }, + { + "episode": 17152, + "epoch": 0.1027668931468766, + "loss/policy_avg": 0.07947662472724915, + "lr": 9.315567484662578e-06, + "objective/entropy": -224.4807891845703, + "objective/kl": 26.412246704101562, + "objective/non_score_reward": -1.3206123113632202, + "objective/rlhf_reward": -3.1597432515778876, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 0.5046712756156921, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.568359375, + "step": 1071, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00076961517334 + }, + { + "episode": 17168, + "epoch": 0.10286275778600616, + "loss/policy_avg": 0.06411048024892807, + "lr": 9.314928425357874e-06, + "objective/entropy": -184.87181091308594, + "objective/kl": 18.737346649169922, + "objective/non_score_reward": -0.9368672370910645, + "objective/rlhf_reward": -1.6247628948846198, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.1642158031463623, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.650390625, + "step": 1072, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995265007019043 + }, + { + "episode": 17184, + "epoch": 0.1029586224251357, + "loss/policy_avg": 0.12491060793399811, + "lr": 9.31428936605317e-06, + "objective/entropy": -264.9185791015625, + "objective/kl": 33.87244415283203, + "objective/non_score_reward": -1.693622350692749, + "objective/rlhf_reward": -4.651782932058845, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.8209168910980225, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.638671875, + "step": 1073, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998373985290527 + }, + { + "episode": 17200, + "epoch": 0.10305448706426526, + "loss/policy_avg": 0.18550439178943634, + "lr": 9.313650306748467e-06, + "objective/entropy": -263.8056335449219, + "objective/kl": 32.30176544189453, + "objective/non_score_reward": -1.6150879859924316, + "objective/rlhf_reward": -4.060352301597595, + "objective/scores": 0.6, + "policy/approxkl_avg": 9.517640113830566, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 1074, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998751163482666 + }, + { + "episode": 17216, + "epoch": 0.1031503517033948, + "loss/policy_avg": 0.03002159669995308, + "lr": 9.313011247443764e-06, + "objective/entropy": -127.8392562866211, + "objective/kl": 34.593231201171875, + "objective/non_score_reward": -1.729661464691162, + "objective/rlhf_reward": -4.518645679950714, + "objective/scores": 0.6, + "policy/approxkl_avg": 8.971546173095703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4599609375, + "step": 1075, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9981253147125244 + }, + { + "episode": 17232, + "epoch": 0.10324621634252436, + "loss/policy_avg": 0.13241755962371826, + "lr": 9.312372188139061e-06, + "objective/entropy": -202.40301513671875, + "objective/kl": 18.52395248413086, + "objective/non_score_reward": -0.9261976480484009, + "objective/rlhf_reward": -2.2238379148796796, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 9.288294792175293, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8125, + "step": 1076, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001105785369873 + }, + { + "episode": 17248, + "epoch": 0.1033420809816539, + "loss/policy_avg": 1.6102979183197021, + "lr": 9.311733128834356e-06, + "objective/entropy": -234.32969665527344, + "objective/kl": 31.251758575439453, + "objective/non_score_reward": -1.5625879764556885, + "objective/rlhf_reward": -4.3029405576752975, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 20.491464614868164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.724609375, + "step": 1077, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996706485748291 + }, + { + "episode": 17264, + "epoch": 0.10343794562078346, + "loss/policy_avg": -0.1527136266231537, + "lr": 9.311094069529653e-06, + "objective/entropy": -268.0172119140625, + "objective/kl": 27.41750144958496, + "objective/non_score_reward": -1.3708750009536743, + "objective/rlhf_reward": -1.083500242233276, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.5196101665496826, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.677734375, + "step": 1078, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0008769035339355 + }, + { + "episode": 17280, + "epoch": 0.103533810259913, + "loss/policy_avg": 0.062209486961364746, + "lr": 9.310455010224948e-06, + "objective/entropy": -160.53085327148438, + "objective/kl": 35.78590774536133, + "objective/non_score_reward": -1.7892953157424927, + "objective/rlhf_reward": -5.676228764469981, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 5.662154674530029, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 1079, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9969273805618286 + }, + { + "episode": 17296, + "epoch": 0.10362967489904255, + "loss/policy_avg": 0.8675416707992554, + "lr": 9.309815950920245e-06, + "objective/entropy": -288.6915283203125, + "objective/kl": 25.7120418548584, + "objective/non_score_reward": -1.28560209274292, + "objective/rlhf_reward": -3.1949969632195785, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 3.7964463233947754, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.748046875, + "step": 1080, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999279260635376 + }, + { + "episode": 17312, + "epoch": 0.1037255395381721, + "loss/policy_avg": 0.022417806088924408, + "lr": 9.309176891615542e-06, + "objective/entropy": -234.59405517578125, + "objective/kl": 29.527116775512695, + "objective/non_score_reward": -1.476355791091919, + "objective/rlhf_reward": -4.2435640148526295, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.2056889533996582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 1081, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0002384185791016 + }, + { + "episode": 17328, + "epoch": 0.10382140417730165, + "loss/policy_avg": 1.0629796981811523, + "lr": 9.308537832310839e-06, + "objective/entropy": -235.58709716796875, + "objective/kl": 24.657703399658203, + "objective/non_score_reward": -1.2328851222991943, + "objective/rlhf_reward": -3.531540727615356, + "objective/scores": 0.35, + "policy/approxkl_avg": 5.628866195678711, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.591796875, + "step": 1082, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9997105598449707 + }, + { + "episode": 17344, + "epoch": 0.1039172688164312, + "loss/policy_avg": 0.43491989374160767, + "lr": 9.307898773006136e-06, + "objective/entropy": -116.438232421875, + "objective/kl": 31.854278564453125, + "objective/non_score_reward": -1.5927139520645142, + "objective/rlhf_reward": -4.970855867862701, + "objective/scores": 0.35, + "policy/approxkl_avg": 11.138096809387207, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.716796875, + "step": 1083, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005722045898438 + }, + { + "episode": 17360, + "epoch": 0.10401313345556075, + "loss/policy_avg": 1.154296636581421, + "lr": 9.307259713701433e-06, + "objective/entropy": -104.04910278320312, + "objective/kl": 33.66610336303711, + "objective/non_score_reward": -1.6833051443099976, + "objective/rlhf_reward": -3.8095016225588054, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 24.187870025634766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619140625, + "step": 1084, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990277290344238 + }, + { + "episode": 17376, + "epoch": 0.10410899809469029, + "loss/policy_avg": 2.80964732170105, + "lr": 9.306620654396728e-06, + "objective/entropy": -223.38082885742188, + "objective/kl": 42.09947967529297, + "objective/non_score_reward": -2.1049740314483643, + "objective/rlhf_reward": -6.47248501606458, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.849597454071045, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.630859375, + "step": 1085, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001406192779541 + }, + { + "episode": 17392, + "epoch": 0.10420486273381985, + "loss/policy_avg": 0.4371190667152405, + "lr": 9.305981595092025e-06, + "objective/entropy": -209.35194396972656, + "objective/kl": 23.755962371826172, + "objective/non_score_reward": -1.187798023223877, + "objective/rlhf_reward": -3.1470724082628063, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 1.6081452369689941, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 1086, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9983155727386475 + }, + { + "episode": 17408, + "epoch": 0.10430072737294939, + "loss/policy_avg": 0.27756333351135254, + "lr": 9.305342535787322e-06, + "objective/entropy": -262.8760986328125, + "objective/kl": 32.76499938964844, + "objective/non_score_reward": -1.6382498741149902, + "objective/rlhf_reward": -4.948879156176167, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 24.257652282714844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.61328125, + "step": 1087, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9991706609725952 + }, + { + "episode": 17424, + "epoch": 0.10439659201207895, + "loss/policy_avg": -0.05298028513789177, + "lr": 9.304703476482619e-06, + "objective/entropy": -69.1202163696289, + "objective/kl": 30.052305221557617, + "objective/non_score_reward": -1.5026153326034546, + "objective/rlhf_reward": -6.010461330413818, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.539027214050293, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4375, + "step": 1088, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001836061477661 + }, + { + "episode": 17440, + "epoch": 0.10449245665120849, + "loss/policy_avg": 0.7193084955215454, + "lr": 9.304064417177915e-06, + "objective/entropy": -143.99217224121094, + "objective/kl": 29.456846237182617, + "objective/non_score_reward": -1.4728422164916992, + "objective/rlhf_reward": -4.229509656847107, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 3.5728912353515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6484375, + "step": 1089, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997127890586853 + }, + { + "episode": 17456, + "epoch": 0.10458832129033804, + "loss/policy_avg": 0.17522019147872925, + "lr": 9.30342535787321e-06, + "objective/entropy": -233.08404541015625, + "objective/kl": 32.47724914550781, + "objective/non_score_reward": -1.6238625049591064, + "objective/rlhf_reward": -5.116847612944943, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 33.11177444458008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 1090, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9972889423370361 + }, + { + "episode": 17472, + "epoch": 0.10468418592946759, + "loss/policy_avg": 0.15333101153373718, + "lr": 9.302786298568508e-06, + "objective/entropy": -160.20663452148438, + "objective/kl": 36.02931594848633, + "objective/non_score_reward": -1.801465630531311, + "objective/rlhf_reward": -5.690090739520725, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 9.341711044311523, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.564453125, + "step": 1091, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000162124633789 + }, + { + "episode": 17488, + "epoch": 0.10478005056859714, + "loss/policy_avg": 0.13975301384925842, + "lr": 9.302147239263804e-06, + "objective/entropy": -148.38388061523438, + "objective/kl": 37.94308853149414, + "objective/non_score_reward": -1.8971545696258545, + "objective/rlhf_reward": -5.76378917244346, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 20.583585739135742, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6875, + "step": 1092, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984629154205322 + }, + { + "episode": 17504, + "epoch": 0.10487591520772668, + "loss/policy_avg": 0.06423387676477432, + "lr": 9.301508179959101e-06, + "objective/entropy": -251.20310974121094, + "objective/kl": 30.99344825744629, + "objective/non_score_reward": -1.5496724843978882, + "objective/rlhf_reward": -4.873177084952516, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 74.65060424804688, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6484375, + "step": 1093, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9979256391525269 + }, + { + "episode": 17520, + "epoch": 0.10497177984685624, + "loss/policy_avg": 0.045309893786907196, + "lr": 9.300869120654398e-06, + "objective/entropy": -231.59390258789062, + "objective/kl": 39.9537353515625, + "objective/non_score_reward": -1.9976863861083984, + "objective/rlhf_reward": -5.590746021270752, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.6203057765960693, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.744140625, + "step": 1094, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9998950958251953 + }, + { + "episode": 17536, + "epoch": 0.1050676444859858, + "loss/policy_avg": 0.784805953502655, + "lr": 9.300230061349695e-06, + "objective/entropy": -211.55604553222656, + "objective/kl": 30.87300682067871, + "objective/non_score_reward": -1.5436503887176514, + "objective/rlhf_reward": -3.250882480980131, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 40.055843353271484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.708984375, + "step": 1095, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0004684925079346 + }, + { + "episode": 17552, + "epoch": 0.10516350912511534, + "loss/policy_avg": -0.08781934529542923, + "lr": 9.29959100204499e-06, + "objective/entropy": -234.98513793945312, + "objective/kl": 32.781734466552734, + "objective/non_score_reward": -1.6390867233276367, + "objective/rlhf_reward": -5.156346833705902, + "objective/scores": 0.35, + "policy/approxkl_avg": 6.987787246704102, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6953125, + "step": 1096, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0007686614990234 + }, + { + "episode": 17568, + "epoch": 0.10525937376424489, + "loss/policy_avg": 0.01477903313934803, + "lr": 9.298951942740287e-06, + "objective/entropy": -247.9517822265625, + "objective/kl": 34.785831451416016, + "objective/non_score_reward": -1.7392916679382324, + "objective/rlhf_reward": -5.009755204396184, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.2163832187652588, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 1097, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9993152618408203 + }, + { + "episode": 17584, + "epoch": 0.10535523840337443, + "loss/policy_avg": 0.1219930574297905, + "lr": 9.298312883435584e-06, + "objective/entropy": -219.2138671875, + "objective/kl": 25.922840118408203, + "objective/non_score_reward": -1.2961418628692627, + "objective/rlhf_reward": -3.703615131790995, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 97.83702087402344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689453125, + "step": 1098, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998638391494751 + }, + { + "episode": 17600, + "epoch": 0.10545110304250399, + "loss/policy_avg": 0.9850329756736755, + "lr": 9.29767382413088e-06, + "objective/entropy": -280.9995422363281, + "objective/kl": 37.09015655517578, + "objective/non_score_reward": -1.8545079231262207, + "objective/rlhf_reward": -5.470620225148137, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 15.378658294677734, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.703125, + "step": 1099, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981629848480225 + }, + { + "episode": 17616, + "epoch": 0.10554696768163353, + "loss/policy_avg": 0.16606320440769196, + "lr": 9.297034764826178e-06, + "objective/entropy": -260.265625, + "objective/kl": 19.693069458007812, + "objective/non_score_reward": -0.9846534132957458, + "objective/rlhf_reward": -2.3344938195386704, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 10.92020320892334, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6640625, + "step": 1100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996177315711975 + }, + { + "episode": 17632, + "epoch": 0.10564283232076309, + "loss/policy_avg": 0.01635119318962097, + "lr": 9.296395705521473e-06, + "objective/entropy": -205.85324096679688, + "objective/kl": 33.84467697143555, + "objective/non_score_reward": -1.6922338008880615, + "objective/rlhf_reward": -5.390333392707211, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 9.969751358032227, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.669921875, + "step": 1101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0005996227264404 + }, + { + "episode": 17648, + "epoch": 0.10573869695989263, + "loss/policy_avg": -0.09314411878585815, + "lr": 9.29575664621677e-06, + "objective/entropy": -140.11074829101562, + "objective/kl": 30.367794036865234, + "objective/non_score_reward": -1.5183897018432617, + "objective/rlhf_reward": -4.592606308873057, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 20.063873291015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.650390625, + "step": 1102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999199628829956 + }, + { + "episode": 17664, + "epoch": 0.10583456159902219, + "loss/policy_avg": 0.6026681661605835, + "lr": 9.295117586912065e-06, + "objective/entropy": -229.55003356933594, + "objective/kl": 40.14759826660156, + "objective/non_score_reward": -2.0073800086975098, + "objective/rlhf_reward": -6.296186701456705, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 5.752803802490234, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.609375, + "step": 1103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99612295627594 + }, + { + "episode": 17680, + "epoch": 0.10593042623815173, + "loss/policy_avg": 0.4246598184108734, + "lr": 9.294478527607362e-06, + "objective/entropy": -282.4384460449219, + "objective/kl": 41.07707977294922, + "objective/non_score_reward": -2.053853988647461, + "objective/rlhf_reward": -6.856166326735897, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 23.673992156982422, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.650390625, + "step": 1104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999070405960083 + }, + { + "episode": 17696, + "epoch": 0.10602629087728128, + "loss/policy_avg": 0.37388309836387634, + "lr": 9.293839468302659e-06, + "objective/entropy": 24.34271240234375, + "objective/kl": 43.73130798339844, + "objective/non_score_reward": -2.186565399169922, + "objective/rlhf_reward": -7.295662741275176, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 5.244170188903809, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 1105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000379800796509 + }, + { + "episode": 17712, + "epoch": 0.10612215551641083, + "loss/policy_avg": 0.48876816034317017, + "lr": 9.293200408997956e-06, + "objective/entropy": -201.01852416992188, + "objective/kl": 26.633869171142578, + "objective/non_score_reward": -1.3316935300827026, + "objective/rlhf_reward": -3.9481719518579066, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 2.9823970794677734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.552734375, + "step": 1106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.996859073638916 + }, + { + "episode": 17728, + "epoch": 0.10621802015554038, + "loss/policy_avg": -0.03377959132194519, + "lr": 9.292561349693252e-06, + "objective/entropy": -243.04660034179688, + "objective/kl": 32.35979080200195, + "objective/non_score_reward": -1.6179895401000977, + "objective/rlhf_reward": -5.048125703533259, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 1.477148175239563, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.65625, + "step": 1107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001762628555298 + }, + { + "episode": 17744, + "epoch": 0.10631388479466992, + "loss/policy_avg": -0.23846808075904846, + "lr": 9.29192229038855e-06, + "objective/entropy": -251.7974395751953, + "objective/kl": 30.760231018066406, + "objective/non_score_reward": -1.5380115509033203, + "objective/rlhf_reward": -4.701447825045928, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 8.709911346435547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.52734375, + "step": 1108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0022435188293457 + }, + { + "episode": 17760, + "epoch": 0.10640974943379948, + "loss/policy_avg": 0.19507169723510742, + "lr": 9.291283231083845e-06, + "objective/entropy": -236.431396484375, + "objective/kl": 29.49862289428711, + "objective/non_score_reward": -1.474931240081787, + "objective/rlhf_reward": -4.237865214765654, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 18.90414047241211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6953125, + "step": 1109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998689889907837 + }, + { + "episode": 17776, + "epoch": 0.10650561407292902, + "loss/policy_avg": 0.08301146328449249, + "lr": 9.290644171779141e-06, + "objective/entropy": -275.0250244140625, + "objective/kl": 41.055580139160156, + "objective/non_score_reward": -2.052779197692871, + "objective/rlhf_reward": -6.477783219019571, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 14.971565246582031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 1110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0006604194641113 + }, + { + "episode": 17792, + "epoch": 0.10660147871205858, + "loss/policy_avg": 1.2557047605514526, + "lr": 9.290005112474438e-06, + "objective/entropy": -183.14273071289062, + "objective/kl": 28.433589935302734, + "objective/non_score_reward": -1.4216796159744263, + "objective/rlhf_reward": -4.28671840429306, + "objective/scores": 0.35, + "policy/approxkl_avg": 7.521367073059082, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.771484375, + "step": 1111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00211763381958 + }, + { + "episode": 17808, + "epoch": 0.10669734335118812, + "loss/policy_avg": -0.1782451868057251, + "lr": 9.289366053169735e-06, + "objective/entropy": -279.40826416015625, + "objective/kl": 18.467693328857422, + "objective/non_score_reward": -0.9233846068382263, + "objective/rlhf_reward": -2.334288516376896, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 4.5754899978637695, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.685546875, + "step": 1112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0029964447021484 + }, + { + "episode": 17824, + "epoch": 0.10679320799031768, + "loss/policy_avg": 0.03669451177120209, + "lr": 9.288726993865032e-06, + "objective/entropy": -223.73326110839844, + "objective/kl": 29.530508041381836, + "objective/non_score_reward": -1.4765253067016602, + "objective/rlhf_reward": -3.783395232931648, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.630830764770508, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62109375, + "step": 1113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9987865686416626 + }, + { + "episode": 17840, + "epoch": 0.10688907262944722, + "loss/policy_avg": 0.8654987215995789, + "lr": 9.288087934560327e-06, + "objective/entropy": -210.11935424804688, + "objective/kl": 29.22211456298828, + "objective/non_score_reward": -1.4611058235168457, + "objective/rlhf_reward": -4.288163571563318, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 0.9125807285308838, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.591796875, + "step": 1114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.004485845565796 + }, + { + "episode": 17856, + "epoch": 0.10698493726857677, + "loss/policy_avg": -0.06222856044769287, + "lr": 9.287448875255624e-06, + "objective/entropy": -234.88995361328125, + "objective/kl": 29.992103576660156, + "objective/non_score_reward": -1.4996052980422974, + "objective/rlhf_reward": -3.074702118278715, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 3.34584903717041, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7265625, + "step": 1115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993617534637451 + }, + { + "episode": 17872, + "epoch": 0.10708080190770632, + "loss/policy_avg": 0.20112337172031403, + "lr": 9.286809815950921e-06, + "objective/entropy": -230.16200256347656, + "objective/kl": 30.825511932373047, + "objective/non_score_reward": -1.5412755012512207, + "objective/rlhf_reward": -4.714504103274688, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 12.865804672241211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6953125, + "step": 1116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9985527992248535 + }, + { + "episode": 17888, + "epoch": 0.10717666654683587, + "loss/policy_avg": 0.6556056141853333, + "lr": 9.286170756646218e-06, + "objective/entropy": -280.40069580078125, + "objective/kl": 28.695655822753906, + "objective/non_score_reward": -1.434782862663269, + "objective/rlhf_reward": -3.6164251587548595, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.831923007965088, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9997923374176025 + }, + { + "episode": 17904, + "epoch": 0.10727253118596541, + "loss/policy_avg": 0.1591615378856659, + "lr": 9.285531697341515e-06, + "objective/entropy": -208.41720581054688, + "objective/kl": 32.10327911376953, + "objective/non_score_reward": -1.605163812637329, + "objective/rlhf_reward": -5.042053201285702, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 16.582778930664062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5390625, + "step": 1118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988048076629639 + }, + { + "episode": 17920, + "epoch": 0.10736839582509497, + "loss/policy_avg": 0.6213997602462769, + "lr": 9.284892638036812e-06, + "objective/entropy": -174.9388427734375, + "objective/kl": 22.156795501708984, + "objective/non_score_reward": -1.107839584350586, + "objective/rlhf_reward": -2.698025361696879, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 2.6573128700256348, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4931640625, + "step": 1119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001469612121582 + }, + { + "episode": 17936, + "epoch": 0.10746426046422451, + "loss/policy_avg": 0.15051786601543427, + "lr": 9.284253578732107e-06, + "objective/entropy": -67.49928283691406, + "objective/kl": 43.85652160644531, + "objective/non_score_reward": -2.19282603263855, + "objective/rlhf_reward": -7.3207059904054255, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 4.555420875549316, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640625, + "step": 1120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001720666885376 + }, + { + "episode": 17952, + "epoch": 0.10756012510335407, + "loss/policy_avg": -0.04347284138202667, + "lr": 9.283614519427404e-06, + "objective/entropy": -228.60853576660156, + "objective/kl": 27.952720642089844, + "objective/non_score_reward": -1.39763605594635, + "objective/rlhf_reward": -5.590544044971466, + "objective/scores": 0.0, + "policy/approxkl_avg": 23.599834442138672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.76953125, + "step": 1121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0020194053649902 + }, + { + "episode": 17968, + "epoch": 0.10765598974248361, + "loss/policy_avg": -0.053687386214733124, + "lr": 9.2829754601227e-06, + "objective/entropy": -207.92953491210938, + "objective/kl": 39.524742126464844, + "objective/non_score_reward": -1.9762370586395264, + "objective/rlhf_reward": -5.957537005619939, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 9.628499984741211, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6171875, + "step": 1122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000182867050171 + }, + { + "episode": 17984, + "epoch": 0.10775185438161317, + "loss/policy_avg": -0.1910426765680313, + "lr": 9.282336400817996e-06, + "objective/entropy": -152.73464965820312, + "objective/kl": 33.28754425048828, + "objective/non_score_reward": -1.664376974105835, + "objective/rlhf_reward": -5.141736590655979, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 5.299195289611816, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.73828125, + "step": 1123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9999160766601562 + }, + { + "episode": 18000, + "epoch": 0.10784771902074271, + "loss/policy_avg": 0.23040008544921875, + "lr": 9.281697341513293e-06, + "objective/entropy": -260.4175109863281, + "objective/kl": 27.83688735961914, + "objective/non_score_reward": -1.391844391822815, + "objective/rlhf_reward": -2.64365843379614, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.275976538658142, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.642578125, + "step": 1124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999161720275879 + }, + { + "episode": 18016, + "epoch": 0.10794358365987226, + "loss/policy_avg": 0.38624101877212524, + "lr": 9.28105828220859e-06, + "objective/entropy": -278.8191833496094, + "objective/kl": 41.93511962890625, + "objective/non_score_reward": -2.0967559814453125, + "objective/rlhf_reward": -5.987024164199829, + "objective/scores": 0.6, + "policy/approxkl_avg": 5.510004043579102, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.000148057937622 + }, + { + "episode": 18032, + "epoch": 0.1080394482990018, + "loss/policy_avg": 0.07502768188714981, + "lr": 9.280419222903886e-06, + "objective/entropy": -261.2082824707031, + "objective/kl": 36.19464111328125, + "objective/non_score_reward": -1.80973219871521, + "objective/rlhf_reward": -5.291517208294804, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 31.399539947509766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.599609375, + "step": 1126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981523752212524 + }, + { + "episode": 18048, + "epoch": 0.10813531293813136, + "loss/policy_avg": 0.027504732832312584, + "lr": 9.279780163599183e-06, + "objective/entropy": -173.93919372558594, + "objective/kl": 38.43782424926758, + "objective/non_score_reward": -1.921891212463379, + "objective/rlhf_reward": -6.236966948123321, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 4.174002647399902, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578125, + "step": 1127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981164932250977 + }, + { + "episode": 18064, + "epoch": 0.1082311775772609, + "loss/policy_avg": -0.27174612879753113, + "lr": 9.279141104294478e-06, + "objective/entropy": -244.70285034179688, + "objective/kl": 29.41028594970703, + "objective/non_score_reward": -1.4705145359039307, + "objective/rlhf_reward": -5.8820579051971436, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.355351448059082, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.76171875, + "step": 1128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0021891593933105 + }, + { + "episode": 18080, + "epoch": 0.10832704221639046, + "loss/policy_avg": 0.1301630437374115, + "lr": 9.278502044989775e-06, + "objective/entropy": -217.2534942626953, + "objective/kl": 24.805774688720703, + "objective/non_score_reward": -1.2402887344360352, + "objective/rlhf_reward": -3.5105568572000116, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 179.41348266601562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.681640625, + "step": 1129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9998412132263184 + }, + { + "episode": 18096, + "epoch": 0.10842290685552, + "loss/policy_avg": 0.29972007870674133, + "lr": 9.277862985685072e-06, + "objective/entropy": -165.94686889648438, + "objective/kl": 33.62857437133789, + "objective/non_score_reward": -1.6814286708831787, + "objective/rlhf_reward": -5.063855295599089, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 9.789844512939453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 1130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996779441833496 + }, + { + "episode": 18112, + "epoch": 0.10851877149464956, + "loss/policy_avg": -0.1860085129737854, + "lr": 9.277223926380369e-06, + "objective/entropy": -216.37200927734375, + "objective/kl": 34.99008560180664, + "objective/non_score_reward": -1.7495043277740479, + "objective/rlhf_reward": -5.517064335759043, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 1.947920799255371, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.591796875, + "step": 1131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001011371612549 + }, + { + "episode": 18128, + "epoch": 0.1086146361337791, + "loss/policy_avg": 1.0164711475372314, + "lr": 9.276584867075666e-06, + "objective/entropy": -198.08203125, + "objective/kl": 27.897228240966797, + "objective/non_score_reward": -1.3948614597320557, + "objective/rlhf_reward": -4.253932688265962, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 3.5322012901306152, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5859375, + "step": 1132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0027313232421875 + }, + { + "episode": 18144, + "epoch": 0.10871050077290866, + "loss/policy_avg": -0.12127675116062164, + "lr": 9.275945807770961e-06, + "objective/entropy": -220.23248291015625, + "objective/kl": 32.97924041748047, + "objective/non_score_reward": -1.6489620208740234, + "objective/rlhf_reward": -4.648436854557927, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 18.976924896240234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.69140625, + "step": 1133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001431465148926 + }, + { + "episode": 18160, + "epoch": 0.1088063654120382, + "loss/policy_avg": 0.2887868881225586, + "lr": 9.275306748466258e-06, + "objective/entropy": -276.16912841796875, + "objective/kl": 37.935035705566406, + "objective/non_score_reward": -1.8967517614364624, + "objective/rlhf_reward": -6.10605442803657, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 18.96986961364746, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.625, + "step": 1134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982385635375977 + }, + { + "episode": 18176, + "epoch": 0.10890223005116775, + "loss/policy_avg": 0.448369562625885, + "lr": 9.274667689161555e-06, + "objective/entropy": -169.45448303222656, + "objective/kl": 37.67509078979492, + "objective/non_score_reward": -1.8837544918060303, + "objective/rlhf_reward": -6.084420065493926, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 27.178815841674805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 1135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000478506088257 + }, + { + "episode": 18192, + "epoch": 0.1089980946902973, + "loss/policy_avg": 0.5679232478141785, + "lr": 9.274028629856852e-06, + "objective/entropy": -180.7431182861328, + "objective/kl": 39.18467330932617, + "objective/non_score_reward": -1.9592337608337402, + "objective/rlhf_reward": -6.103601590792337, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 27.431682586669922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58203125, + "step": 1136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9959231615066528 + }, + { + "episode": 18208, + "epoch": 0.10909395932942685, + "loss/policy_avg": 0.08655049651861191, + "lr": 9.273389570552149e-06, + "objective/entropy": -251.33828735351562, + "objective/kl": 30.559293746948242, + "objective/non_score_reward": -1.52796471118927, + "objective/rlhf_reward": -4.164447735028203, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.2445521354675293, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.689453125, + "step": 1137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9996728897094727 + }, + { + "episode": 18224, + "epoch": 0.1091898239685564, + "loss/policy_avg": 0.4302634000778198, + "lr": 9.272750511247446e-06, + "objective/entropy": -201.8494873046875, + "objective/kl": 29.424352645874023, + "objective/non_score_reward": -1.4712176322937012, + "objective/rlhf_reward": -1.4848706483840939, + "objective/scores": 1.1, + "policy/approxkl_avg": 20.697341918945312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8203125, + "step": 1138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9989333152770996 + }, + { + "episode": 18240, + "epoch": 0.10928568860768595, + "loss/policy_avg": 0.9915270209312439, + "lr": 9.27211145194274e-06, + "objective/entropy": -195.59429931640625, + "objective/kl": 21.045230865478516, + "objective/non_score_reward": -1.052261471748352, + "objective/rlhf_reward": -2.80904603600502, + "objective/scores": 0.35, + "policy/approxkl_avg": 28.377094268798828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71484375, + "step": 1139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998765230178833 + }, + { + "episode": 18256, + "epoch": 0.10938155324681549, + "loss/policy_avg": 0.49453747272491455, + "lr": 9.271472392638038e-06, + "objective/entropy": -245.22964477539062, + "objective/kl": 32.85436248779297, + "objective/non_score_reward": -1.6427181959152222, + "objective/rlhf_reward": -5.089920165951609, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 14.714433670043945, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.599609375, + "step": 1140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996521234512329 + }, + { + "episode": 18272, + "epoch": 0.10947741788594505, + "loss/policy_avg": 1.36152184009552, + "lr": 9.270833333333334e-06, + "objective/entropy": -272.47137451171875, + "objective/kl": 34.61804962158203, + "objective/non_score_reward": -1.7309024333953857, + "objective/rlhf_reward": -5.581974318533568, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.44586181640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.654296875, + "step": 1141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9999563694000244 + }, + { + "episode": 18288, + "epoch": 0.10957328252507459, + "loss/policy_avg": 0.2819780111312866, + "lr": 9.270194274028631e-06, + "objective/entropy": -202.5043487548828, + "objective/kl": 25.666091918945312, + "objective/non_score_reward": -1.2833045721054077, + "objective/rlhf_reward": -3.7739683029398154, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 2.4244799613952637, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.552734375, + "step": 1142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999279260635376 + }, + { + "episode": 18304, + "epoch": 0.10966914716420414, + "loss/policy_avg": 0.25256872177124023, + "lr": 9.269555214723928e-06, + "objective/entropy": -231.06277465820312, + "objective/kl": 30.289072036743164, + "objective/non_score_reward": -1.514453649520874, + "objective/rlhf_reward": -4.657814359664917, + "objective/scores": 0.35, + "policy/approxkl_avg": 17.748353958129883, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71875, + "step": 1143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983248710632324 + }, + { + "episode": 18320, + "epoch": 0.10976501180333369, + "loss/policy_avg": -0.33820840716362, + "lr": 9.268916155419223e-06, + "objective/entropy": -73.95364379882812, + "objective/kl": 28.924686431884766, + "objective/non_score_reward": -1.4462342262268066, + "objective/rlhf_reward": -4.303984346802592, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 2.515535831451416, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.779296875, + "step": 1144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0056748390197754 + }, + { + "episode": 18336, + "epoch": 0.10986087644246324, + "loss/policy_avg": 0.6078078746795654, + "lr": 9.26827709611452e-06, + "objective/entropy": -114.01469421386719, + "objective/kl": 33.08042526245117, + "objective/non_score_reward": -1.6540212631225586, + "objective/rlhf_reward": -5.13513237517631, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 18.6502628326416, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998767614364624 + }, + { + "episode": 18352, + "epoch": 0.10995674108159278, + "loss/policy_avg": 0.34172698855400085, + "lr": 9.267638036809816e-06, + "objective/entropy": -220.97189331054688, + "objective/kl": 30.25277328491211, + "objective/non_score_reward": -1.512638807296753, + "objective/rlhf_reward": -4.317221657435099, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.9541758298873901, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.603515625, + "step": 1146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0016610622406006 + }, + { + "episode": 18368, + "epoch": 0.11005260572072234, + "loss/policy_avg": 0.29632118344306946, + "lr": 9.266998977505112e-06, + "objective/entropy": -200.36410522460938, + "objective/kl": 26.179067611694336, + "objective/non_score_reward": -1.3089535236358643, + "objective/rlhf_reward": -2.835813796520233, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.2951159477233887, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.576171875, + "step": 1147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.004591464996338 + }, + { + "episode": 18384, + "epoch": 0.11014847035985188, + "loss/policy_avg": 0.011747203767299652, + "lr": 9.26635991820041e-06, + "objective/entropy": -194.40054321289062, + "objective/kl": 31.329753875732422, + "objective/non_score_reward": -1.5664877891540527, + "objective/rlhf_reward": -4.784998300488352, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 1.8787118196487427, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.466796875, + "step": 1148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0023856163024902 + }, + { + "episode": 18400, + "epoch": 0.11024433499898144, + "loss/policy_avg": 0.46494680643081665, + "lr": 9.265720858895706e-06, + "objective/entropy": -223.58827209472656, + "objective/kl": 28.735855102539062, + "objective/non_score_reward": -1.4367928504943848, + "objective/rlhf_reward": -4.347171103954315, + "objective/scores": 0.35, + "policy/approxkl_avg": 3.884065866470337, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.490234375, + "step": 1149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982936382293701 + }, + { + "episode": 18416, + "epoch": 0.11034019963811098, + "loss/policy_avg": 0.28439557552337646, + "lr": 9.265081799591003e-06, + "objective/entropy": -147.24366760253906, + "objective/kl": 34.880985260009766, + "objective/non_score_reward": -1.7440491914749146, + "objective/rlhf_reward": -5.242863551775614, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 79.35762023925781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.734375, + "step": 1150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9970107078552246 + }, + { + "episode": 18432, + "epoch": 0.11043606427724054, + "loss/policy_avg": 0.3585757613182068, + "lr": 9.2644427402863e-06, + "objective/entropy": -71.19611358642578, + "objective/kl": 26.725967407226562, + "objective/non_score_reward": -1.3362984657287598, + "objective/rlhf_reward": -3.8642411856011147, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 5.22227668762207, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4990234375, + "step": 1151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982261657714844 + }, + { + "episode": 18448, + "epoch": 0.11053192891637008, + "loss/policy_avg": -0.044132016599178314, + "lr": 9.263803680981595e-06, + "objective/entropy": -228.6917724609375, + "objective/kl": 28.40880584716797, + "objective/non_score_reward": -1.4204403162002563, + "objective/rlhf_reward": -5.6817615032196045, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9962811470031738, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.66015625, + "step": 1152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992928504943848 + }, + { + "episode": 18464, + "epoch": 0.11062779355549963, + "loss/policy_avg": 0.0064825452864170074, + "lr": 9.263164621676892e-06, + "objective/entropy": -258.4649658203125, + "objective/kl": 27.05806541442871, + "objective/non_score_reward": -1.3529033660888672, + "objective/rlhf_reward": -3.9306607274368996, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 6.10453462600708, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.546875, + "step": 1153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005645751953125 + }, + { + "episode": 18480, + "epoch": 0.11072365819462919, + "loss/policy_avg": -0.0035511665046215057, + "lr": 9.262525562372189e-06, + "objective/entropy": -282.88446044921875, + "objective/kl": 30.65878677368164, + "objective/non_score_reward": -1.5329391956329346, + "objective/rlhf_reward": -4.575497834888056, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.0608371496200562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.693359375, + "step": 1154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0007755756378174 + }, + { + "episode": 18496, + "epoch": 0.11081952283375873, + "loss/policy_avg": 0.02788732573390007, + "lr": 9.261886503067486e-06, + "objective/entropy": -116.1088638305664, + "objective/kl": 30.207550048828125, + "objective/non_score_reward": -1.5103774070739746, + "objective/rlhf_reward": -3.918803753630195, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 75.21327209472656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619140625, + "step": 1155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990315437316895 + }, + { + "episode": 18512, + "epoch": 0.11091538747288829, + "loss/policy_avg": 0.32094255089759827, + "lr": 9.261247443762783e-06, + "objective/entropy": -214.9591064453125, + "objective/kl": 27.392032623291016, + "objective/non_score_reward": -1.3696017265319824, + "objective/rlhf_reward": -3.355700316206489, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 28.679149627685547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.572265625, + "step": 1156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9985802173614502 + }, + { + "episode": 18528, + "epoch": 0.11101125211201783, + "loss/policy_avg": 0.20208770036697388, + "lr": 9.260608384458078e-06, + "objective/entropy": -160.56893920898438, + "objective/kl": 37.55027770996094, + "objective/non_score_reward": -1.8775138854980469, + "objective/rlhf_reward": -5.848196154058563, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 12.605989456176758, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.50390625, + "step": 1157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999098777770996 + }, + { + "episode": 18544, + "epoch": 0.11110711675114739, + "loss/policy_avg": 0.3753480613231659, + "lr": 9.259969325153375e-06, + "objective/entropy": -242.1776123046875, + "objective/kl": 39.135337829589844, + "objective/non_score_reward": -1.9567670822143555, + "objective/rlhf_reward": -5.427068269252777, + "objective/scores": 0.6, + "policy/approxkl_avg": 35.09158706665039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 1158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99853515625 + }, + { + "episode": 18560, + "epoch": 0.11120298139027693, + "loss/policy_avg": -0.17678791284561157, + "lr": 9.259330265848672e-06, + "objective/entropy": -155.45452880859375, + "objective/kl": 29.033279418945312, + "objective/non_score_reward": -1.4516640901565552, + "objective/rlhf_reward": -4.40665636062622, + "objective/scores": 0.35, + "policy/approxkl_avg": 9.420263290405273, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 1159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994099140167236 + }, + { + "episode": 18576, + "epoch": 0.11129884602940648, + "loss/policy_avg": 0.2095283716917038, + "lr": 9.258691206543968e-06, + "objective/entropy": -245.34713745117188, + "objective/kl": 27.264514923095703, + "objective/non_score_reward": -1.3632256984710693, + "objective/rlhf_reward": -3.896643607822016, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 14.997028350830078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 1160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9966801404953003 + }, + { + "episode": 18592, + "epoch": 0.11139471066853603, + "loss/policy_avg": 0.35818007588386536, + "lr": 9.258052147239265e-06, + "objective/entropy": -235.89605712890625, + "objective/kl": 27.769607543945312, + "objective/non_score_reward": -1.3884804248809814, + "objective/rlhf_reward": -1.153921282291412, + "objective/scores": 1.1, + "policy/approxkl_avg": 22.18886375427246, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.771484375, + "step": 1161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9975862503051758 + }, + { + "episode": 18608, + "epoch": 0.11149057530766558, + "loss/policy_avg": 0.31447115540504456, + "lr": 9.257413087934562e-06, + "objective/entropy": -129.99705505371094, + "objective/kl": 39.8328742980957, + "objective/non_score_reward": -1.9916437864303589, + "objective/rlhf_reward": -6.410315840449885, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 15.095479011535645, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 1162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9973938465118408 + }, + { + "episode": 18624, + "epoch": 0.11158643994679512, + "loss/policy_avg": 0.0677080750465393, + "lr": 9.256774028629857e-06, + "objective/entropy": -159.806884765625, + "objective/kl": 28.60342788696289, + "objective/non_score_reward": -1.4301713705062866, + "objective/rlhf_reward": -4.3420833135522425, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 4.259771347045898, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.65625, + "step": 1163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000201940536499 + }, + { + "episode": 18640, + "epoch": 0.11168230458592468, + "loss/policy_avg": 0.19306568801403046, + "lr": 9.256134969325154e-06, + "objective/entropy": -209.5618133544922, + "objective/kl": 19.490875244140625, + "objective/non_score_reward": -0.9745436906814575, + "objective/rlhf_reward": -0.9744557484400003, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 0.33440613746643066, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.59765625, + "step": 1164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9999253749847412 + }, + { + "episode": 18656, + "epoch": 0.11177816922505422, + "loss/policy_avg": 0.11631269752979279, + "lr": 9.255495910020451e-06, + "objective/entropy": -141.29168701171875, + "objective/kl": 37.15015411376953, + "objective/non_score_reward": -1.8575077056884766, + "objective/rlhf_reward": -5.873771755900934, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.6151018142700195, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51953125, + "step": 1165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0023040771484375 + }, + { + "episode": 18672, + "epoch": 0.11187403386418378, + "loss/policy_avg": 0.01576380617916584, + "lr": 9.254856850715748e-06, + "objective/entropy": -215.0299835205078, + "objective/kl": 14.439537048339844, + "objective/non_score_reward": -0.7219768762588501, + "objective/rlhf_reward": -2.8879075050354004, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.3693623542785645, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.509765625, + "step": 1166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.003568172454834 + }, + { + "episode": 18688, + "epoch": 0.11196989850331332, + "loss/policy_avg": 0.08836716413497925, + "lr": 9.254217791411043e-06, + "objective/entropy": -220.81651306152344, + "objective/kl": 27.33843994140625, + "objective/non_score_reward": -1.3669219017028809, + "objective/rlhf_reward": -3.951916241439518, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 1.9005239009857178, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7109375, + "step": 1167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.998638391494751 + }, + { + "episode": 18704, + "epoch": 0.11206576314244288, + "loss/policy_avg": 0.1386057436466217, + "lr": 9.25357873210634e-06, + "objective/entropy": -206.8209686279297, + "objective/kl": 30.620820999145508, + "objective/non_score_reward": -1.531041145324707, + "objective/rlhf_reward": -4.390831009546916, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 2.888638973236084, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 1168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985263347625732 + }, + { + "episode": 18720, + "epoch": 0.11216162778157242, + "loss/policy_avg": 0.17286451160907745, + "lr": 9.252939672801637e-06, + "objective/entropy": -276.5692138671875, + "objective/kl": 31.233203887939453, + "objective/non_score_reward": -1.5616602897644043, + "objective/rlhf_reward": -4.730869614871677, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 7.134778022766113, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.671875, + "step": 1169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9994456768035889 + }, + { + "episode": 18736, + "epoch": 0.11225749242070197, + "loss/policy_avg": 0.31586384773254395, + "lr": 9.252300613496932e-06, + "objective/entropy": -248.99765014648438, + "objective/kl": 33.04867172241211, + "objective/non_score_reward": -1.6524336338043213, + "objective/rlhf_reward": -5.005614492956715, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 6.374646186828613, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 1170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998569369316101 + }, + { + "episode": 18752, + "epoch": 0.11235335705983152, + "loss/policy_avg": -0.09263397008180618, + "lr": 9.251661554192229e-06, + "objective/entropy": -183.73135375976562, + "objective/kl": 29.070640563964844, + "objective/non_score_reward": -1.4535319805145264, + "objective/rlhf_reward": -3.9892991736260166, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 47.9519157409668, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.751953125, + "step": 1171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0019617080688477 + }, + { + "episode": 18768, + "epoch": 0.11244922169896107, + "loss/policy_avg": -0.012390676885843277, + "lr": 9.251022494887526e-06, + "objective/entropy": -198.5019073486328, + "objective/kl": 33.66993713378906, + "objective/non_score_reward": -1.6834967136383057, + "objective/rlhf_reward": -5.333986735343933, + "objective/scores": 0.35, + "policy/approxkl_avg": 7.644756317138672, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6640625, + "step": 1172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0004210472106934 + }, + { + "episode": 18784, + "epoch": 0.11254508633809061, + "loss/policy_avg": -0.12474697828292847, + "lr": 9.250383435582823e-06, + "objective/entropy": -258.70025634765625, + "objective/kl": 36.01386260986328, + "objective/non_score_reward": -1.8006932735443115, + "objective/rlhf_reward": -5.824170806495053, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 11.808134078979492, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.671875, + "step": 1173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0037684440612793 + }, + { + "episode": 18800, + "epoch": 0.11264095097722017, + "loss/policy_avg": 0.06612593680620193, + "lr": 9.24974437627812e-06, + "objective/entropy": -211.03541564941406, + "objective/kl": 28.66901397705078, + "objective/non_score_reward": -1.4334505796432495, + "objective/rlhf_reward": -4.39216672471109, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.856602191925049, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6796875, + "step": 1174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004348754882812 + }, + { + "episode": 18816, + "epoch": 0.11273681561634971, + "loss/policy_avg": 0.17440900206565857, + "lr": 9.249105316973417e-06, + "objective/entropy": -233.4525146484375, + "objective/kl": 26.882205963134766, + "objective/non_score_reward": -1.344110369682312, + "objective/rlhf_reward": -3.9978394890702784, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 91.22392272949219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 1175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9991576671600342 + }, + { + "episode": 18832, + "epoch": 0.11283268025547927, + "loss/policy_avg": 0.5238691568374634, + "lr": 9.248466257668712e-06, + "objective/entropy": -173.6719970703125, + "objective/kl": 34.459197998046875, + "objective/non_score_reward": -1.7229597568511963, + "objective/rlhf_reward": -5.335579781737879, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 25.83188247680664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986010789871216 + }, + { + "episode": 18848, + "epoch": 0.11292854489460881, + "loss/policy_avg": -0.12078897655010223, + "lr": 9.247827198364009e-06, + "objective/entropy": -134.12008666992188, + "objective/kl": 34.92095184326172, + "objective/non_score_reward": -1.7460476160049438, + "objective/rlhf_reward": -5.159361715587686, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 7.610663414001465, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.751953125, + "step": 1177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0009992122650146 + }, + { + "episode": 18864, + "epoch": 0.11302440953373837, + "loss/policy_avg": 0.3817252516746521, + "lr": 9.247188139059305e-06, + "objective/entropy": -96.26307678222656, + "objective/kl": 44.49664306640625, + "objective/non_score_reward": -2.224832057952881, + "objective/rlhf_reward": -8.899328708648682, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.693860054016113, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.80859375, + "step": 1178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.997776985168457 + }, + { + "episode": 18880, + "epoch": 0.11312027417286791, + "loss/policy_avg": 0.07123968750238419, + "lr": 9.246549079754602e-06, + "objective/entropy": -199.199951171875, + "objective/kl": 27.166889190673828, + "objective/non_score_reward": -1.358344554901123, + "objective/rlhf_reward": -3.98277972182785, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 8.361668586730957, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 1179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971075057983398 + }, + { + "episode": 18896, + "epoch": 0.11321613881199746, + "loss/policy_avg": 0.14846912026405334, + "lr": 9.2459100204499e-06, + "objective/entropy": -175.1884765625, + "objective/kl": 31.658098220825195, + "objective/non_score_reward": -1.5829048156738281, + "objective/rlhf_reward": -4.6697597555523975, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 0.9070639610290527, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0005788803100586 + }, + { + "episode": 18912, + "epoch": 0.113312003451127, + "loss/policy_avg": 0.3216549754142761, + "lr": 9.245270961145194e-06, + "objective/entropy": -182.9542236328125, + "objective/kl": 31.30569839477539, + "objective/non_score_reward": -1.5652849674224854, + "objective/rlhf_reward": -4.70488056441839, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 14.701448440551758, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 1181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980485439300537 + }, + { + "episode": 18928, + "epoch": 0.11340786809025656, + "loss/policy_avg": 0.4251779019832611, + "lr": 9.244631901840491e-06, + "objective/entropy": -195.88975524902344, + "objective/kl": 28.441465377807617, + "objective/non_score_reward": -1.4220733642578125, + "objective/rlhf_reward": -4.0841732359567455, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 14.071691513061523, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 1182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986789226531982 + }, + { + "episode": 18944, + "epoch": 0.1135037327293861, + "loss/policy_avg": 0.02354581654071808, + "lr": 9.243992842535788e-06, + "objective/entropy": -164.94105529785156, + "objective/kl": 30.754886627197266, + "objective/non_score_reward": -1.537744402885437, + "objective/rlhf_reward": -3.227258478046629, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 0.801190972328186, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.669921875, + "step": 1183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002558708190918 + }, + { + "episode": 18960, + "epoch": 0.11359959736851566, + "loss/policy_avg": 0.04112057387828827, + "lr": 9.243353783231085e-06, + "objective/entropy": -246.19515991210938, + "objective/kl": 34.75521469116211, + "objective/non_score_reward": -1.7377607822418213, + "objective/rlhf_reward": -5.609407058268218, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 22.861713409423828, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.66015625, + "step": 1184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9987796545028687 + }, + { + "episode": 18976, + "epoch": 0.1136954620076452, + "loss/policy_avg": 0.21404039859771729, + "lr": 9.242714723926382e-06, + "objective/entropy": -209.3376922607422, + "objective/kl": 35.15364074707031, + "objective/non_score_reward": -1.7576820850372314, + "objective/rlhf_reward": -5.083317349629338, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 45.4697265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.650390625, + "step": 1185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996396541595459 + }, + { + "episode": 18992, + "epoch": 0.11379132664677476, + "loss/policy_avg": -0.016785871237516403, + "lr": 9.242075664621679e-06, + "objective/entropy": -135.11508178710938, + "objective/kl": 44.11357879638672, + "objective/non_score_reward": -2.205678939819336, + "objective/rlhf_reward": -7.160855894506561, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 20.996137619018555, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.609375, + "step": 1186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9976164102554321 + }, + { + "episode": 19008, + "epoch": 0.1138871912859043, + "loss/policy_avg": -0.0332149937748909, + "lr": 9.241436605316974e-06, + "objective/entropy": -85.4975814819336, + "objective/kl": 33.72305679321289, + "objective/non_score_reward": -1.6861528158187866, + "objective/rlhf_reward": -2.3446113824844357, + "objective/scores": 1.1, + "policy/approxkl_avg": 115.56517028808594, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.849609375, + "step": 1187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9999394416809082 + }, + { + "episode": 19024, + "epoch": 0.11398305592503385, + "loss/policy_avg": 0.10150502622127533, + "lr": 9.240797546012271e-06, + "objective/entropy": -228.79638671875, + "objective/kl": 24.752819061279297, + "objective/non_score_reward": -1.2376409769058228, + "objective/rlhf_reward": -4.9505637884140015, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5263676643371582, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.517578125, + "step": 1188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0000452995300293 + }, + { + "episode": 19040, + "epoch": 0.1140789205641634, + "loss/policy_avg": 1.6640098094940186, + "lr": 9.240158486707568e-06, + "objective/entropy": -222.82388305664062, + "objective/kl": 37.1962890625, + "objective/non_score_reward": -1.8598144054412842, + "objective/rlhf_reward": -5.958305004055857, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 26.163982391357422, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.583984375, + "step": 1189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998447299003601 + }, + { + "episode": 19056, + "epoch": 0.11417478520329295, + "loss/policy_avg": 0.24511002004146576, + "lr": 9.239519427402863e-06, + "objective/entropy": -169.49942016601562, + "objective/kl": 23.688583374023438, + "objective/non_score_reward": -1.1844291687011719, + "objective/rlhf_reward": -3.0758574656849964, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 25.747421264648438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.83984375, + "step": 1190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997087001800537 + }, + { + "episode": 19072, + "epoch": 0.1142706498424225, + "loss/policy_avg": 0.05934782326221466, + "lr": 9.23888036809816e-06, + "objective/entropy": -195.64088439941406, + "objective/kl": 33.113624572753906, + "objective/non_score_reward": -1.6556813716888428, + "objective/rlhf_reward": -5.172127108188018, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 15.268495559692383, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.765625, + "step": 1191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998260736465454 + }, + { + "episode": 19088, + "epoch": 0.11436651448155205, + "loss/policy_avg": 0.15776914358139038, + "lr": 9.238241308793457e-06, + "objective/entropy": -227.42486572265625, + "objective/kl": 30.864715576171875, + "objective/non_score_reward": -1.5432357788085938, + "objective/rlhf_reward": -4.722344736667022, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 2.075873613357544, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.703125, + "step": 1192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991707801818848 + }, + { + "episode": 19104, + "epoch": 0.11446237912068159, + "loss/policy_avg": 0.07561061531305313, + "lr": 9.237602249488754e-06, + "objective/entropy": -201.01284790039062, + "objective/kl": 36.205970764160156, + "objective/non_score_reward": -1.8102984428405762, + "objective/rlhf_reward": -5.790595392794952, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 0.3898843228816986, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 1193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0015416145324707 + }, + { + "episode": 19120, + "epoch": 0.11455824375981115, + "loss/policy_avg": -0.5701497793197632, + "lr": 9.236963190184049e-06, + "objective/entropy": -103.09819030761719, + "objective/kl": 30.238616943359375, + "objective/non_score_reward": -1.5119309425354004, + "objective/rlhf_reward": -4.56677112263, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 4.551431655883789, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.59375, + "step": 1194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.004034996032715 + }, + { + "episode": 19136, + "epoch": 0.11465410839894069, + "loss/policy_avg": 0.46027839183807373, + "lr": 9.236324130879346e-06, + "objective/entropy": -208.53213500976562, + "objective/kl": 33.96599197387695, + "objective/non_score_reward": -1.6982996463775635, + "objective/rlhf_reward": -4.968369956287455, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 32.79998016357422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.76953125, + "step": 1195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9957493543624878 + }, + { + "episode": 19152, + "epoch": 0.11474997303807025, + "loss/policy_avg": 0.7104591131210327, + "lr": 9.235685071574642e-06, + "objective/entropy": -161.5511932373047, + "objective/kl": 21.182106018066406, + "objective/non_score_reward": -1.0591052770614624, + "objective/rlhf_reward": -2.7554683117226357, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 113.69915771484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.810546875, + "step": 1196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9965519905090332 + }, + { + "episode": 19168, + "epoch": 0.11484583767719979, + "loss/policy_avg": -0.09501040726900101, + "lr": 9.23504601226994e-06, + "objective/entropy": -149.43408203125, + "objective/kl": 36.27130126953125, + "objective/non_score_reward": -1.8135650157928467, + "objective/rlhf_reward": -7.254260301589966, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.328031539916992, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62109375, + "step": 1197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0035271644592285 + }, + { + "episode": 19184, + "epoch": 0.11494170231632934, + "loss/policy_avg": 0.566871702671051, + "lr": 9.234406952965236e-06, + "objective/entropy": -217.4463653564453, + "objective/kl": 29.27811050415039, + "objective/non_score_reward": -1.4639055728912354, + "objective/rlhf_reward": -4.51398654869142, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 53.84545135498047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58203125, + "step": 1198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9971461296081543 + }, + { + "episode": 19200, + "epoch": 0.11503756695545889, + "loss/policy_avg": 0.048794396221637726, + "lr": 9.233767893660533e-06, + "objective/entropy": -174.531005859375, + "objective/kl": 29.087738037109375, + "objective/non_score_reward": -1.4543869495391846, + "objective/rlhf_reward": -2.893828962684843, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.782432556152344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.791015625, + "step": 1199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999170303344727 + }, + { + "episode": 19216, + "epoch": 0.11513343159458844, + "loss/policy_avg": -0.18217583000659943, + "lr": 9.233128834355828e-06, + "objective/entropy": -135.63037109375, + "objective/kl": 40.30628967285156, + "objective/non_score_reward": -2.0153145790100098, + "objective/rlhf_reward": -6.545486414226231, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 8.027623176574707, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.75, + "step": 1200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.00277042388916 + }, + { + "episode": 19232, + "epoch": 0.11522929623371798, + "loss/policy_avg": 0.19046634435653687, + "lr": 9.232489775051125e-06, + "objective/entropy": -245.74639892578125, + "objective/kl": 29.675251007080078, + "objective/non_score_reward": -1.483762502670288, + "objective/rlhf_reward": -4.575800263617916, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.7879266738891602, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.61328125, + "step": 1201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00600528717041 + }, + { + "episode": 19248, + "epoch": 0.11532516087284754, + "loss/policy_avg": 0.21539102494716644, + "lr": 9.231850715746422e-06, + "objective/entropy": -225.9239044189453, + "objective/kl": 28.502708435058594, + "objective/non_score_reward": -1.4251353740692139, + "objective/rlhf_reward": -4.144282071796015, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 23.979963302612305, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.712890625, + "step": 1202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997289180755615 + }, + { + "episode": 19264, + "epoch": 0.11542102551197708, + "loss/policy_avg": 0.04965958744287491, + "lr": 9.231211656441719e-06, + "objective/entropy": -216.04248046875, + "objective/kl": 31.314760208129883, + "objective/non_score_reward": -1.5657379627227783, + "objective/rlhf_reward": -4.1402458570161205, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 8.937564849853516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5234375, + "step": 1203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976413249969482 + }, + { + "episode": 19280, + "epoch": 0.11551689015110664, + "loss/policy_avg": 0.02601933479309082, + "lr": 9.230572597137016e-06, + "objective/entropy": -148.66250610351562, + "objective/kl": 32.67079162597656, + "objective/non_score_reward": -1.6335396766662598, + "objective/rlhf_reward": -5.018386685641941, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 18.645111083984375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.79296875, + "step": 1204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.004350185394287 + }, + { + "episode": 19296, + "epoch": 0.11561275479023618, + "loss/policy_avg": 1.4007536172866821, + "lr": 9.229933537832311e-06, + "objective/entropy": -258.147705078125, + "objective/kl": 34.21760559082031, + "objective/non_score_reward": -1.7108802795410156, + "objective/rlhf_reward": -6.843520998954773, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.961824417114258, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 1205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0000836849212646 + }, + { + "episode": 19312, + "epoch": 0.11570861942936574, + "loss/policy_avg": -0.39194512367248535, + "lr": 9.229294478527608e-06, + "objective/entropy": -100.05964660644531, + "objective/kl": 36.88145065307617, + "objective/non_score_reward": -1.8440725803375244, + "objective/rlhf_reward": -5.551461453708718, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 8.472518920898438, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.599609375, + "step": 1206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0011942386627197 + }, + { + "episode": 19328, + "epoch": 0.11580448406849528, + "loss/policy_avg": 0.31982097029685974, + "lr": 9.228655419222905e-06, + "objective/entropy": -219.31304931640625, + "objective/kl": 38.748992919921875, + "objective/non_score_reward": -1.9374498128890991, + "objective/rlhf_reward": -3.349799251556396, + "objective/scores": 1.1, + "policy/approxkl_avg": 78.4788818359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.701171875, + "step": 1207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975740909576416 + }, + { + "episode": 19344, + "epoch": 0.11590034870762483, + "loss/policy_avg": 0.28548291325569153, + "lr": 9.228016359918202e-06, + "objective/entropy": -157.26446533203125, + "objective/kl": 40.80043029785156, + "objective/non_score_reward": -2.0400216579437256, + "objective/rlhf_reward": -6.335257883342813, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 8.33885383605957, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 1208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999432563781738 + }, + { + "episode": 19360, + "epoch": 0.11599621334675438, + "loss/policy_avg": 0.03618919104337692, + "lr": 9.227377300613499e-06, + "objective/entropy": -179.19644165039062, + "objective/kl": 33.20772933959961, + "objective/non_score_reward": -1.6603864431381226, + "objective/rlhf_reward": -4.908212439219157, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.311634540557861, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 1209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9987711906433105 + }, + { + "episode": 19376, + "epoch": 0.11609207798588393, + "loss/policy_avg": 0.15800103545188904, + "lr": 9.226738241308795e-06, + "objective/entropy": -270.1763916015625, + "objective/kl": 29.480056762695312, + "objective/non_score_reward": -1.4740028381347656, + "objective/rlhf_reward": -4.570498857527895, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 2.025053024291992, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 1210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997854232788086 + }, + { + "episode": 19392, + "epoch": 0.11618794262501349, + "loss/policy_avg": 0.08228084444999695, + "lr": 9.22609918200409e-06, + "objective/entropy": -265.70428466796875, + "objective/kl": 27.694522857666016, + "objective/non_score_reward": -1.3847262859344482, + "objective/rlhf_reward": -4.1389049053192135, + "objective/scores": 0.35, + "policy/approxkl_avg": 11.403460502624512, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.546875, + "step": 1211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995503425598145 + }, + { + "episode": 19408, + "epoch": 0.11628380726414303, + "loss/policy_avg": 0.24947790801525116, + "lr": 9.225460122699387e-06, + "objective/entropy": -214.40487670898438, + "objective/kl": 36.13543701171875, + "objective/non_score_reward": -1.8067721128463745, + "objective/rlhf_reward": -7.227088212966919, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.953628540039062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 1212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974710941314697 + }, + { + "episode": 19424, + "epoch": 0.11637967190327259, + "loss/policy_avg": 0.25788062810897827, + "lr": 9.224821063394683e-06, + "objective/entropy": -189.17974853515625, + "objective/kl": 29.4897518157959, + "objective/non_score_reward": -1.4744876623153687, + "objective/rlhf_reward": -4.29383042818697, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.7698781490325928, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 1213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9973881244659424 + }, + { + "episode": 19440, + "epoch": 0.11647553654240213, + "loss/policy_avg": -0.10094030201435089, + "lr": 9.22418200408998e-06, + "objective/entropy": -178.26290893554688, + "objective/kl": 36.244503021240234, + "objective/non_score_reward": -1.812225103378296, + "objective/rlhf_reward": -5.644780430857258, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 3.1554298400878906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.673828125, + "step": 1214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0035483837127686 + }, + { + "episode": 19456, + "epoch": 0.11657140118153168, + "loss/policy_avg": -0.2695544958114624, + "lr": 9.223542944785276e-06, + "objective/entropy": -224.8712158203125, + "objective/kl": 32.469635009765625, + "objective/non_score_reward": -1.6234817504882812, + "objective/rlhf_reward": -3.57020804727194, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.8041396141052246, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.65234375, + "step": 1215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0009067058563232 + }, + { + "episode": 19472, + "epoch": 0.11666726582066123, + "loss/policy_avg": 0.19596442580223083, + "lr": 9.222903885480573e-06, + "objective/entropy": -216.5953369140625, + "objective/kl": 32.196380615234375, + "objective/non_score_reward": -1.6098190546035767, + "objective/rlhf_reward": -5.039275979995727, + "objective/scores": 0.35, + "policy/approxkl_avg": 3.0333876609802246, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0020110607147217 + }, + { + "episode": 19488, + "epoch": 0.11676313045979078, + "loss/policy_avg": -0.0265303086489439, + "lr": 9.22226482617587e-06, + "objective/entropy": -177.5919189453125, + "objective/kl": 27.259849548339844, + "objective/non_score_reward": -1.362992525100708, + "objective/rlhf_reward": -2.528251205326292, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.3001770973205566, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.611328125, + "step": 1217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996999502182007 + }, + { + "episode": 19504, + "epoch": 0.11685899509892032, + "loss/policy_avg": 0.21211574971675873, + "lr": 9.221625766871165e-06, + "objective/entropy": -239.07907104492188, + "objective/kl": 20.40753936767578, + "objective/non_score_reward": -1.0203769207000732, + "objective/rlhf_reward": -2.681507921218872, + "objective/scores": 0.35, + "policy/approxkl_avg": 2.7712390422821045, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.564453125, + "step": 1218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0028648376464844 + }, + { + "episode": 19520, + "epoch": 0.11695485973804988, + "loss/policy_avg": 0.07235918194055557, + "lr": 9.220986707566462e-06, + "objective/entropy": -152.53878784179688, + "objective/kl": 21.97917366027832, + "objective/non_score_reward": -1.0989587306976318, + "objective/rlhf_reward": -1.4721155508768287, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 3.2449498176574707, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 1219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997578501701355 + }, + { + "episode": 19536, + "epoch": 0.11705072437717942, + "loss/policy_avg": 0.5660937428474426, + "lr": 9.220347648261759e-06, + "objective/entropy": -249.38014221191406, + "objective/kl": 41.259254455566406, + "objective/non_score_reward": -2.062962532043457, + "objective/rlhf_reward": -6.873248794165951, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 4.481976509094238, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.66796875, + "step": 1220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982693195343018 + }, + { + "episode": 19552, + "epoch": 0.11714658901630898, + "loss/policy_avg": 0.01150442287325859, + "lr": 9.219708588957056e-06, + "objective/entropy": -215.35882568359375, + "objective/kl": 34.312686920166016, + "objective/non_score_reward": -1.7156343460083008, + "objective/rlhf_reward": -5.346765959056553, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 17.738174438476562, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.69140625, + "step": 1221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9997079372406006 + }, + { + "episode": 19568, + "epoch": 0.11724245365543852, + "loss/policy_avg": -0.059517666697502136, + "lr": 9.219069529652353e-06, + "objective/entropy": -210.27809143066406, + "objective/kl": 27.43333625793457, + "objective/non_score_reward": -1.3716667890548706, + "objective/rlhf_reward": -4.005714538510203, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 16.074115753173828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 1222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001340389251709 + }, + { + "episode": 19584, + "epoch": 0.11733831829456808, + "loss/policy_avg": -0.15733516216278076, + "lr": 9.21843047034765e-06, + "objective/entropy": -235.85507202148438, + "objective/kl": 28.09206199645996, + "objective/non_score_reward": -1.4046030044555664, + "objective/rlhf_reward": -4.167813996882781, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 98.53274536132812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 1223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0014100074768066 + }, + { + "episode": 19600, + "epoch": 0.11743418293369762, + "loss/policy_avg": 0.035943709313869476, + "lr": 9.217791411042945e-06, + "objective/entropy": -244.24017333984375, + "objective/kl": 37.203941345214844, + "objective/non_score_reward": -1.8601970672607422, + "objective/rlhf_reward": -6.115275297194643, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 3.5133166313171387, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.697265625, + "step": 1224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994046688079834 + }, + { + "episode": 19616, + "epoch": 0.11753004757282717, + "loss/policy_avg": 0.1306331604719162, + "lr": 9.217152351738242e-06, + "objective/entropy": -190.14393615722656, + "objective/kl": 33.84905242919922, + "objective/non_score_reward": -1.6924527883529663, + "objective/rlhf_reward": -5.254039132388767, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 3.3994088172912598, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.705078125, + "step": 1225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9983464479446411 + }, + { + "episode": 19632, + "epoch": 0.11762591221195672, + "loss/policy_avg": 0.0009730234742164612, + "lr": 9.216513292433539e-06, + "objective/entropy": -216.55715942382812, + "objective/kl": 30.103256225585938, + "objective/non_score_reward": -1.5051627159118652, + "objective/rlhf_reward": -3.8979449889817577, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.51433527469635, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 1226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0015969276428223 + }, + { + "episode": 19648, + "epoch": 0.11772177685108627, + "loss/policy_avg": 0.16437333822250366, + "lr": 9.215874233128836e-06, + "objective/entropy": -255.0314178466797, + "objective/kl": 45.41230010986328, + "objective/non_score_reward": -2.2706151008605957, + "objective/rlhf_reward": -7.478340182367878, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 13.10407829284668, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.693359375, + "step": 1227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9988343715667725 + }, + { + "episode": 19664, + "epoch": 0.11781764149021581, + "loss/policy_avg": 0.0678139179944992, + "lr": 9.215235173824132e-06, + "objective/entropy": -190.25567626953125, + "objective/kl": 31.204730987548828, + "objective/non_score_reward": -1.5602366924285889, + "objective/rlhf_reward": -4.725174867900547, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 0.9944963455200195, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.640625, + "step": 1228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.003727436065674 + }, + { + "episode": 19680, + "epoch": 0.11791350612934537, + "loss/policy_avg": 0.10750436782836914, + "lr": 9.21459611451943e-06, + "objective/entropy": -212.99404907226562, + "objective/kl": 31.576601028442383, + "objective/non_score_reward": -1.5788300037384033, + "objective/rlhf_reward": -3.391601239086363, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.086095333099365, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71875, + "step": 1229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975237846374512 + }, + { + "episode": 19696, + "epoch": 0.11800937076847491, + "loss/policy_avg": 0.26910707354545593, + "lr": 9.213957055214725e-06, + "objective/entropy": -264.12017822265625, + "objective/kl": 27.552576065063477, + "objective/non_score_reward": -1.3776288032531738, + "objective/rlhf_reward": -3.7771820584932962, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 14.264134407043457, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 1230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9975985288619995 + }, + { + "episode": 19712, + "epoch": 0.11810523540760447, + "loss/policy_avg": 0.09155163168907166, + "lr": 9.213317995910021e-06, + "objective/entropy": -141.91424560546875, + "objective/kl": 32.08643341064453, + "objective/non_score_reward": -1.6043215990066528, + "objective/rlhf_reward": -4.5924574091759425, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 0.6272682547569275, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5390625, + "step": 1231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0007212162017822 + }, + { + "episode": 19728, + "epoch": 0.11820110004673401, + "loss/policy_avg": 0.028797071427106857, + "lr": 9.212678936605318e-06, + "objective/entropy": -301.3397216796875, + "objective/kl": 29.216651916503906, + "objective/non_score_reward": -1.4608327150344849, + "objective/rlhf_reward": -4.287071674075678, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 11.006927490234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 1232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975908994674683 + }, + { + "episode": 19744, + "epoch": 0.11829696468586356, + "loss/policy_avg": 0.12966430187225342, + "lr": 9.212039877300615e-06, + "objective/entropy": -220.30935668945312, + "objective/kl": 42.5980224609375, + "objective/non_score_reward": -2.12990140914917, + "objective/rlhf_reward": -6.396899165884529, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 10.670743942260742, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68359375, + "step": 1233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994142055511475 + }, + { + "episode": 19760, + "epoch": 0.1183928293249931, + "loss/policy_avg": 0.3277433514595032, + "lr": 9.21140081799591e-06, + "objective/entropy": -144.93858337402344, + "objective/kl": 34.81742858886719, + "objective/non_score_reward": -1.740871548652649, + "objective/rlhf_reward": -5.447714292796787, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 144.9310302734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.67578125, + "step": 1234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978101253509521 + }, + { + "episode": 19776, + "epoch": 0.11848869396412266, + "loss/policy_avg": 0.6404599547386169, + "lr": 9.210761758691207e-06, + "objective/entropy": -259.30499267578125, + "objective/kl": 39.584476470947266, + "objective/non_score_reward": -1.9792238473892212, + "objective/rlhf_reward": -6.18356229464213, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 9.638875961303711, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 1235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9967763423919678 + }, + { + "episode": 19792, + "epoch": 0.1185845586032522, + "loss/policy_avg": 0.20158489048480988, + "lr": 9.210122699386504e-06, + "objective/entropy": -137.64532470703125, + "objective/kl": 37.92731475830078, + "objective/non_score_reward": -1.8963658809661865, + "objective/rlhf_reward": -6.22621377680151, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 3.0997841358184814, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 1236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9978444576263428 + }, + { + "episode": 19808, + "epoch": 0.11868042324238176, + "loss/policy_avg": 0.013250820338726044, + "lr": 9.2094836400818e-06, + "objective/entropy": -204.8336944580078, + "objective/kl": 25.06024169921875, + "objective/non_score_reward": -1.2530121803283691, + "objective/rlhf_reward": -3.4962769387089576, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 3.133713960647583, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55859375, + "step": 1237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000356435775757 + }, + { + "episode": 19824, + "epoch": 0.1187762878815113, + "loss/policy_avg": 0.23657885193824768, + "lr": 9.208844580777096e-06, + "objective/entropy": -257.93719482421875, + "objective/kl": 34.67414855957031, + "objective/non_score_reward": -1.7337074279785156, + "objective/rlhf_reward": -5.201496616999308, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 40.29893112182617, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 1238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974265098571777 + }, + { + "episode": 19840, + "epoch": 0.11887215252064086, + "loss/policy_avg": 1.0746341943740845, + "lr": 9.208205521472393e-06, + "objective/entropy": -137.0782928466797, + "objective/kl": 38.25480270385742, + "objective/non_score_reward": -1.9127401113510132, + "objective/rlhf_reward": -6.200362186045989, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 4.038956165313721, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 1239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000051736831665 + }, + { + "episode": 19856, + "epoch": 0.1189680171597704, + "loss/policy_avg": 0.2902667224407196, + "lr": 9.20756646216769e-06, + "objective/entropy": -241.73587036132812, + "objective/kl": 32.18947982788086, + "objective/non_score_reward": -1.6094739437103271, + "objective/rlhf_reward": -4.613067384037088, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.234606742858887, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.615234375, + "step": 1240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9977645874023438 + }, + { + "episode": 19872, + "epoch": 0.11906388179889996, + "loss/policy_avg": 0.02851104736328125, + "lr": 9.206927402862987e-06, + "objective/entropy": -160.71896362304688, + "objective/kl": 47.23845291137695, + "objective/non_score_reward": -2.3619225025177, + "objective/rlhf_reward": -7.966737869198679, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 35.956363677978516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.626953125, + "step": 1241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000300645828247 + }, + { + "episode": 19888, + "epoch": 0.1191597464380295, + "loss/policy_avg": 0.2741260528564453, + "lr": 9.206288343558284e-06, + "objective/entropy": -148.2718963623047, + "objective/kl": 38.57466125488281, + "objective/non_score_reward": -1.9287331104278564, + "objective/rlhf_reward": -6.264334539981231, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 3.452293872833252, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.59375, + "step": 1242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0030832290649414 + }, + { + "episode": 19904, + "epoch": 0.11925561107715905, + "loss/policy_avg": 0.5994369387626648, + "lr": 9.205649284253579e-06, + "objective/entropy": -123.61450958251953, + "objective/kl": 36.576622009277344, + "objective/non_score_reward": -1.8288313150405884, + "objective/rlhf_reward": -5.973689547091155, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 12.081830024719238, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.626953125, + "step": 1243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998105764389038 + }, + { + "episode": 19920, + "epoch": 0.1193514757162886, + "loss/policy_avg": -0.38412266969680786, + "lr": 9.205010224948876e-06, + "objective/entropy": -250.1025848388672, + "objective/kl": 33.524559020996094, + "objective/non_score_reward": -1.6762280464172363, + "objective/rlhf_reward": -2.3049123644828793, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.83417510986328, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.736328125, + "step": 1244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.003070592880249 + }, + { + "episode": 19936, + "epoch": 0.11944734035541815, + "loss/policy_avg": 2.035850763320923, + "lr": 9.204371165644173e-06, + "objective/entropy": -190.210693359375, + "objective/kl": 26.431785583496094, + "objective/non_score_reward": -1.321589469909668, + "objective/rlhf_reward": -3.6244980148678883, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 25.93347930908203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736328125, + "step": 1245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.997817039489746 + }, + { + "episode": 19952, + "epoch": 0.1195432049945477, + "loss/policy_avg": 1.105665683746338, + "lr": 9.20373210633947e-06, + "objective/entropy": -201.83714294433594, + "objective/kl": 35.3839225769043, + "objective/non_score_reward": -1.7691962718963623, + "objective/rlhf_reward": -5.698182680693966, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 6.776236534118652, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.857421875, + "step": 1246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000749349594116 + }, + { + "episode": 19968, + "epoch": 0.11963906963367725, + "loss/policy_avg": -0.04859113693237305, + "lr": 9.203093047034766e-06, + "objective/entropy": -258.0498046875, + "objective/kl": 28.967775344848633, + "objective/non_score_reward": -1.4483888149261475, + "objective/rlhf_reward": -3.393555378913879, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.895939826965332, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.681640625, + "step": 1247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000812292098999 + }, + { + "episode": 19984, + "epoch": 0.11973493427280679, + "loss/policy_avg": 0.644065797328949, + "lr": 9.202453987730062e-06, + "objective/entropy": -258.9081726074219, + "objective/kl": 38.442054748535156, + "objective/non_score_reward": -1.922102689743042, + "objective/rlhf_reward": -6.264578659732905, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 4.295760154724121, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.67578125, + "step": 1248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992396831512451 + }, + { + "episode": 20000, + "epoch": 0.11983079891193635, + "loss/policy_avg": 0.9093930125236511, + "lr": 9.201814928425358e-06, + "objective/entropy": -194.09771728515625, + "objective/kl": 41.89799499511719, + "objective/non_score_reward": -2.0948996543884277, + "objective/rlhf_reward": -6.432187746243413, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.594277381896973, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 1249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973087310791016 + }, + { + "episode": 20016, + "epoch": 0.11992666355106589, + "loss/policy_avg": 0.09421442449092865, + "lr": 9.201175869120655e-06, + "objective/entropy": -276.9185485839844, + "objective/kl": 33.968955993652344, + "objective/non_score_reward": -1.6984477043151855, + "objective/rlhf_reward": -5.23753175040777, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.0292129516601562, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.736328125, + "step": 1250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9998764991760254 + }, + { + "episode": 20032, + "epoch": 0.12002252819019545, + "loss/policy_avg": 0.053171977400779724, + "lr": 9.200536809815952e-06, + "objective/entropy": -104.78028869628906, + "objective/kl": 29.34747314453125, + "objective/non_score_reward": -1.4673736095428467, + "objective/rlhf_reward": -4.388541939671397, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 16.4470157623291, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.76171875, + "step": 1251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982876777648926 + }, + { + "episode": 20048, + "epoch": 0.12011839282932499, + "loss/policy_avg": 0.21370352804660797, + "lr": 9.199897750511249e-06, + "objective/entropy": -211.57241821289062, + "objective/kl": 34.70026779174805, + "objective/non_score_reward": -1.735013484954834, + "objective/rlhf_reward": -5.561452009765011, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.8732821941375732, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 1252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002889394760132 + }, + { + "episode": 20064, + "epoch": 0.12021425746845454, + "loss/policy_avg": -0.06851379573345184, + "lr": 9.199258691206546e-06, + "objective/entropy": -247.22412109375, + "objective/kl": 24.82408905029297, + "objective/non_score_reward": -1.2412043809890747, + "objective/rlhf_reward": -4.964817762374878, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.531271457672119, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.59375, + "step": 1253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0064773559570312 + }, + { + "episode": 20080, + "epoch": 0.12031012210758409, + "loss/policy_avg": 0.9840347766876221, + "lr": 9.198619631901841e-06, + "objective/entropy": -122.53502655029297, + "objective/kl": 40.514495849609375, + "objective/non_score_reward": -2.0257248878479004, + "objective/rlhf_reward": -6.652301590056762, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 12.03805923461914, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640625, + "step": 1254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9971917867660522 + }, + { + "episode": 20096, + "epoch": 0.12040598674671364, + "loss/policy_avg": 0.18231819570064545, + "lr": 9.197980572597138e-06, + "objective/entropy": -241.79513549804688, + "objective/kl": 38.14476776123047, + "objective/non_score_reward": -1.907238483428955, + "objective/rlhf_reward": -6.250351646033627, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.30513334274292, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 1255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998397827148438 + }, + { + "episode": 20112, + "epoch": 0.12050185138584318, + "loss/policy_avg": 0.23248505592346191, + "lr": 9.197341513292433e-06, + "objective/entropy": -235.57354736328125, + "objective/kl": 23.809890747070312, + "objective/non_score_reward": -1.1904945373535156, + "objective/rlhf_reward": -1.838259015918943, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 16.297555923461914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 1256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000349521636963 + }, + { + "episode": 20128, + "epoch": 0.12059771602497274, + "loss/policy_avg": 0.06544123589992523, + "lr": 9.19670245398773e-06, + "objective/entropy": -148.2562255859375, + "objective/kl": 42.84388732910156, + "objective/non_score_reward": -2.1421945095062256, + "objective/rlhf_reward": -7.144946057994929, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 11.275466918945312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.556640625, + "step": 1257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001687526702881 + }, + { + "episode": 20144, + "epoch": 0.12069358066410228, + "loss/policy_avg": -0.2555674612522125, + "lr": 9.196063394683027e-06, + "objective/entropy": -262.81939697265625, + "objective/kl": 37.51679229736328, + "objective/non_score_reward": -1.8758397102355957, + "objective/rlhf_reward": -5.380652429834877, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 5.35495662689209, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.658203125, + "step": 1258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0027287006378174 + }, + { + "episode": 20160, + "epoch": 0.12078944530323184, + "loss/policy_avg": -0.2871710956096649, + "lr": 9.195424335378324e-06, + "objective/entropy": -230.1177978515625, + "objective/kl": 37.040069580078125, + "objective/non_score_reward": -1.8520034551620483, + "objective/rlhf_reward": -7.408013701438904, + "objective/scores": 0.0, + "policy/approxkl_avg": 33.58317947387695, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.498046875, + "step": 1259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0012998580932617 + }, + { + "episode": 20176, + "epoch": 0.12088530994236138, + "loss/policy_avg": 0.29908883571624756, + "lr": 9.19478527607362e-06, + "objective/entropy": -172.86453247070312, + "objective/kl": 45.35060501098633, + "objective/non_score_reward": -2.2675304412841797, + "objective/rlhf_reward": -7.408261661947357, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 5.0800933837890625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69921875, + "step": 1260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9999351501464844 + }, + { + "episode": 20192, + "epoch": 0.12098117458149094, + "loss/policy_avg": 0.25280916690826416, + "lr": 9.194146216768916e-06, + "objective/entropy": -241.52896118164062, + "objective/kl": 48.261566162109375, + "objective/non_score_reward": -2.4130783081054688, + "objective/rlhf_reward": -8.20171544990097, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 6.6377339363098145, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 1261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9999206066131592 + }, + { + "episode": 20208, + "epoch": 0.12107703922062048, + "loss/policy_avg": 0.3357711136341095, + "lr": 9.193507157464213e-06, + "objective/entropy": -187.2262725830078, + "objective/kl": 40.54385757446289, + "objective/non_score_reward": -2.0271928310394287, + "objective/rlhf_reward": -6.749521457885189, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 3.054933786392212, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.537109375, + "step": 1262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999709129333496 + }, + { + "episode": 20224, + "epoch": 0.12117290385975003, + "loss/policy_avg": 0.036879949271678925, + "lr": 9.19286809815951e-06, + "objective/entropy": -289.2770690917969, + "objective/kl": 36.072391510009766, + "objective/non_score_reward": -1.803619384765625, + "objective/rlhf_reward": -5.790645618637171, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 1.7514324188232422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 1263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9988840818405151 + }, + { + "episode": 20240, + "epoch": 0.12126876849887958, + "loss/policy_avg": 0.022309046238660812, + "lr": 9.192229038854807e-06, + "objective/entropy": -295.97265625, + "objective/kl": 34.17414093017578, + "objective/non_score_reward": -1.7087069749832153, + "objective/rlhf_reward": -5.509315047293825, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 2.405184507369995, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 1264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0027103424072266 + }, + { + "episode": 20256, + "epoch": 0.12136463313800913, + "loss/policy_avg": 0.11772053688764572, + "lr": 9.191589979550103e-06, + "objective/entropy": -216.94451904296875, + "objective/kl": 29.35517692565918, + "objective/non_score_reward": -1.4677588939666748, + "objective/rlhf_reward": -3.923624227719243, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 0.7640889883041382, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.63671875, + "step": 1265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0017471313476562 + }, + { + "episode": 20272, + "epoch": 0.12146049777713867, + "loss/policy_avg": 0.45337143540382385, + "lr": 9.1909509202454e-06, + "objective/entropy": -248.708984375, + "objective/kl": 25.6322021484375, + "objective/non_score_reward": -1.2816100120544434, + "objective/rlhf_reward": -3.7848046331697995, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 0.588313102722168, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6875, + "step": 1266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.001028299331665 + }, + { + "episode": 20288, + "epoch": 0.12155636241626823, + "loss/policy_avg": 0.4821030795574188, + "lr": 9.190311860940695e-06, + "objective/entropy": -246.07826232910156, + "objective/kl": 25.80655288696289, + "objective/non_score_reward": -1.2903276681900024, + "objective/rlhf_reward": -2.7613106727600094, + "objective/scores": 0.6, + "policy/approxkl_avg": 10.890335083007812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62109375, + "step": 1267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9990689754486084 + }, + { + "episode": 20304, + "epoch": 0.12165222705539779, + "loss/policy_avg": 0.28960275650024414, + "lr": 9.189672801635992e-06, + "objective/entropy": -265.9043273925781, + "objective/kl": 30.99881362915039, + "objective/non_score_reward": -1.5499407052993774, + "objective/rlhf_reward": -4.077056469694648, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 13.907394409179688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.716796875, + "step": 1268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9994606971740723 + }, + { + "episode": 20320, + "epoch": 0.12174809169452733, + "loss/policy_avg": 0.03770780563354492, + "lr": 9.18903374233129e-06, + "objective/entropy": -197.77639770507812, + "objective/kl": 29.34738540649414, + "objective/non_score_reward": -1.4673693180084229, + "objective/rlhf_reward": -4.1361438194910685, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 0.8509318828582764, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.701171875, + "step": 1269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0020077228546143 + }, + { + "episode": 20336, + "epoch": 0.12184395633365688, + "loss/policy_avg": 0.06795699894428253, + "lr": 9.188394683026586e-06, + "objective/entropy": -213.62989807128906, + "objective/kl": 31.280406951904297, + "objective/non_score_reward": -1.5640202760696411, + "objective/rlhf_reward": -4.930568132430238, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 8.444547653198242, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5703125, + "step": 1270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9978091716766357 + }, + { + "episode": 20352, + "epoch": 0.12193982097278643, + "loss/policy_avg": 0.43714067339897156, + "lr": 9.187755623721883e-06, + "objective/entropy": -243.02096557617188, + "objective/kl": 39.39186477661133, + "objective/non_score_reward": -1.9695932865142822, + "objective/rlhf_reward": -6.39742028992927, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 44.565582275390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 1271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9978324174880981 + }, + { + "episode": 20368, + "epoch": 0.12203568561191598, + "loss/policy_avg": -0.05719127878546715, + "lr": 9.187116564417178e-06, + "objective/entropy": -192.6077880859375, + "objective/kl": 32.60759735107422, + "objective/non_score_reward": -1.6303796768188477, + "objective/rlhf_reward": -5.097686667640773, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 5.910696983337402, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.572265625, + "step": 1272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.005018472671509 + }, + { + "episode": 20384, + "epoch": 0.12213155025104552, + "loss/policy_avg": 0.10466927289962769, + "lr": 9.186477505112475e-06, + "objective/entropy": -230.49244689941406, + "objective/kl": 27.570762634277344, + "objective/non_score_reward": -1.3785381317138672, + "objective/rlhf_reward": -3.9983805058323707, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 43.770347595214844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58984375, + "step": 1273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983129501342773 + }, + { + "episode": 20400, + "epoch": 0.12222741489017508, + "loss/policy_avg": 0.32652002573013306, + "lr": 9.185838445807772e-06, + "objective/entropy": -237.1881866455078, + "objective/kl": 35.98992919921875, + "objective/non_score_reward": -1.7994965314865112, + "objective/rlhf_reward": -5.872473273307008, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 3.772566795349121, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7265625, + "step": 1274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0020248889923096 + }, + { + "episode": 20416, + "epoch": 0.12232327952930462, + "loss/policy_avg": -0.041273415088653564, + "lr": 9.185199386503069e-06, + "objective/entropy": -249.03428649902344, + "objective/kl": 30.465728759765625, + "objective/non_score_reward": -1.5232863426208496, + "objective/rlhf_reward": -4.359812394777934, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 34.99227523803711, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.72265625, + "step": 1275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000488758087158 + }, + { + "episode": 20432, + "epoch": 0.12241914416843418, + "loss/policy_avg": 0.07646825909614563, + "lr": 9.184560327198366e-06, + "objective/entropy": -274.52752685546875, + "objective/kl": 28.405258178710938, + "objective/non_score_reward": -1.4202628135681152, + "objective/rlhf_reward": -4.321801745627804, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 0.582542896270752, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6015625, + "step": 1276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000093936920166 + }, + { + "episode": 20448, + "epoch": 0.12251500880756372, + "loss/policy_avg": 0.6634305119514465, + "lr": 9.183921267893663e-06, + "objective/entropy": -237.85279846191406, + "objective/kl": 37.730873107910156, + "objective/non_score_reward": -1.88654363155365, + "objective/rlhf_reward": -6.065221848900675, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 3.300227165222168, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 1277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9993836879730225 + }, + { + "episode": 20464, + "epoch": 0.12261087344669327, + "loss/policy_avg": 0.32286834716796875, + "lr": 9.183282208588958e-06, + "objective/entropy": -170.94064331054688, + "objective/kl": 35.21946716308594, + "objective/non_score_reward": -1.7609732151031494, + "objective/rlhf_reward": -2.643892979621887, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.075477600097656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69921875, + "step": 1278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9982943534851074 + }, + { + "episode": 20480, + "epoch": 0.12270673808582282, + "loss/policy_avg": -0.0007353071123361588, + "lr": 9.182643149284255e-06, + "objective/entropy": -208.5531005859375, + "objective/kl": 36.26404571533203, + "objective/non_score_reward": -1.8132022619247437, + "objective/rlhf_reward": -5.874206879226071, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.7173817157745361, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.51953125, + "step": 1279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9998679161071777 + }, + { + "episode": 20496, + "epoch": 0.12280260272495237, + "loss/policy_avg": 0.0016644150018692017, + "lr": 9.18200408997955e-06, + "objective/entropy": -276.2265930175781, + "objective/kl": 37.951438903808594, + "objective/non_score_reward": -1.8975720405578613, + "objective/rlhf_reward": -5.765459294590066, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 1.626516580581665, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6484375, + "step": 1280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0007171630859375 + }, + { + "episode": 20512, + "epoch": 0.12289846736408191, + "loss/policy_avg": 0.9792773723602295, + "lr": 9.181365030674847e-06, + "objective/entropy": -181.45407104492188, + "objective/kl": 47.48221969604492, + "objective/non_score_reward": -2.3741111755371094, + "objective/rlhf_reward": -8.117842295256953, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 7.093747138977051, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997685194015503 + }, + { + "episode": 20528, + "epoch": 0.12299433200321147, + "loss/policy_avg": 0.35386669635772705, + "lr": 9.180725971370144e-06, + "objective/entropy": -225.07867431640625, + "objective/kl": 32.99415588378906, + "objective/non_score_reward": -1.6497077941894531, + "objective/rlhf_reward": -5.174998839099971, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 36.31614685058594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7578125, + "step": 1282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996511936187744 + }, + { + "episode": 20544, + "epoch": 0.12309019664234101, + "loss/policy_avg": 0.9949113130569458, + "lr": 9.18008691206544e-06, + "objective/entropy": -144.422119140625, + "objective/kl": 41.22947311401367, + "objective/non_score_reward": -2.061473846435547, + "objective/rlhf_reward": -6.886644804213924, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 29.33792495727539, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.70703125, + "step": 1283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9961674213409424 + }, + { + "episode": 20560, + "epoch": 0.12318606128147057, + "loss/policy_avg": 0.01584434136748314, + "lr": 9.179447852760737e-06, + "objective/entropy": -218.75259399414062, + "objective/kl": 29.35763168334961, + "objective/non_score_reward": -1.467881679534912, + "objective/rlhf_reward": -4.512276732657833, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 0.7336653470993042, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 1284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0014827251434326 + }, + { + "episode": 20576, + "epoch": 0.12328192592060011, + "loss/policy_avg": 0.1053546816110611, + "lr": 9.178808793456033e-06, + "objective/entropy": -253.1468963623047, + "objective/kl": 34.82318878173828, + "objective/non_score_reward": -1.741159439086914, + "objective/rlhf_reward": -4.5646381139755245, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.2562737464904785, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 1285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9984190464019775 + }, + { + "episode": 20592, + "epoch": 0.12337779055972967, + "loss/policy_avg": 1.1642229557037354, + "lr": 9.17816973415133e-06, + "objective/entropy": -255.337646484375, + "objective/kl": 33.243751525878906, + "objective/non_score_reward": -1.6621875762939453, + "objective/rlhf_reward": -6.648750364780426, + "objective/scores": 0.0, + "policy/approxkl_avg": 38.7473030090332, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.775390625, + "step": 1286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.997887134552002 + }, + { + "episode": 20608, + "epoch": 0.12347365519885921, + "loss/policy_avg": -0.24089229106903076, + "lr": 9.177530674846626e-06, + "objective/entropy": -243.97262573242188, + "objective/kl": 25.183528900146484, + "objective/non_score_reward": -1.259176254272461, + "objective/rlhf_reward": -3.6128731562691607, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 9.022109031677246, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.642578125, + "step": 1287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.003718376159668 + }, + { + "episode": 20624, + "epoch": 0.12356951983798876, + "loss/policy_avg": 0.2587750554084778, + "lr": 9.176891615541923e-06, + "objective/entropy": -264.50152587890625, + "objective/kl": 47.71129608154297, + "objective/non_score_reward": -2.3855648040771484, + "objective/rlhf_reward": -7.142259335517883, + "objective/scores": 0.6, + "policy/approxkl_avg": 4.712902069091797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.607421875, + "step": 1288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9966511726379395 + }, + { + "episode": 20640, + "epoch": 0.1236653844771183, + "loss/policy_avg": 0.3948793411254883, + "lr": 9.17625255623722e-06, + "objective/entropy": -154.65003967285156, + "objective/kl": 40.509239196777344, + "objective/non_score_reward": -2.0254621505737305, + "objective/rlhf_reward": -6.7018486022949215, + "objective/scores": 0.35, + "policy/approxkl_avg": 3.50528621673584, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.533203125, + "step": 1289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000455617904663 + }, + { + "episode": 20656, + "epoch": 0.12376124911624786, + "loss/policy_avg": 0.20847059786319733, + "lr": 9.175613496932517e-06, + "objective/entropy": -233.9412078857422, + "objective/kl": 41.79835510253906, + "objective/non_score_reward": -2.0899178981781006, + "objective/rlhf_reward": -7.0180361776644276, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 8.814239501953125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.607421875, + "step": 1290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9992281198501587 + }, + { + "episode": 20672, + "epoch": 0.1238571137553774, + "loss/policy_avg": 0.34916895627975464, + "lr": 9.174974437627812e-06, + "objective/entropy": -225.4031982421875, + "objective/kl": 40.641937255859375, + "objective/non_score_reward": -2.0320968627929688, + "objective/rlhf_reward": -6.572128622737482, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 49.47692108154297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9976723194122314 + }, + { + "episode": 20688, + "epoch": 0.12395297839450696, + "loss/policy_avg": 2.849823474884033, + "lr": 9.174335378323109e-06, + "objective/entropy": -330.77435302734375, + "objective/kl": 23.790363311767578, + "objective/non_score_reward": -1.1895182132720947, + "objective/rlhf_reward": -3.307474712939605, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 46.90863800048828, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.603515625, + "step": 1292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00309157371521 + }, + { + "episode": 20704, + "epoch": 0.1240488430336365, + "loss/policy_avg": 0.20439790189266205, + "lr": 9.173696319018406e-06, + "objective/entropy": -275.361328125, + "objective/kl": 32.65497589111328, + "objective/non_score_reward": -1.6327489614486694, + "objective/rlhf_reward": -2.1309958457946774, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.454024314880371, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.591796875, + "step": 1293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983692169189453 + }, + { + "episode": 20720, + "epoch": 0.12414470767276606, + "loss/policy_avg": 0.6102030277252197, + "lr": 9.173057259713703e-06, + "objective/entropy": -218.39520263671875, + "objective/kl": 27.34351348876953, + "objective/non_score_reward": -1.3671756982803345, + "objective/rlhf_reward": -3.9877501754120583, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 5.690610885620117, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.697265625, + "step": 1294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999359130859375 + }, + { + "episode": 20736, + "epoch": 0.1242405723118956, + "loss/policy_avg": 0.12826263904571533, + "lr": 9.172418200409e-06, + "objective/entropy": -288.58819580078125, + "objective/kl": 36.94340133666992, + "objective/non_score_reward": -1.847170114517212, + "objective/rlhf_reward": -5.9648483588295855, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 9.69528579711914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 1295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9996252059936523 + }, + { + "episode": 20752, + "epoch": 0.12433643695102516, + "loss/policy_avg": -0.08016486465930939, + "lr": 9.171779141104295e-06, + "objective/entropy": -226.99656677246094, + "objective/kl": 41.39111328125, + "objective/non_score_reward": -2.0695557594299316, + "objective/rlhf_reward": -5.878223037719726, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.6657519340515137, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.677734375, + "step": 1296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000385046005249 + }, + { + "episode": 20768, + "epoch": 0.1244323015901547, + "loss/policy_avg": 0.44178086519241333, + "lr": 9.171140081799592e-06, + "objective/entropy": -236.19082641601562, + "objective/kl": 32.86880111694336, + "objective/non_score_reward": -1.6434402465820312, + "objective/rlhf_reward": -5.232125094442992, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 3.834670066833496, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.701171875, + "step": 1297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988516569137573 + }, + { + "episode": 20784, + "epoch": 0.12452816622928425, + "loss/policy_avg": 0.5984504818916321, + "lr": 9.170501022494889e-06, + "objective/entropy": -274.540771484375, + "objective/kl": 29.187076568603516, + "objective/non_score_reward": -1.4593539237976074, + "objective/rlhf_reward": -4.478166067336483, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 2.234905242919922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73828125, + "step": 1298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.004179000854492 + }, + { + "episode": 20800, + "epoch": 0.1246240308684138, + "loss/policy_avg": -0.41880375146865845, + "lr": 9.169861963190185e-06, + "objective/entropy": -268.99920654296875, + "objective/kl": 33.32693862915039, + "objective/non_score_reward": -1.6663470268249512, + "objective/rlhf_reward": -4.265388345718383, + "objective/scores": 0.6, + "policy/approxkl_avg": 9.45730972290039, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.66796875, + "step": 1299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0022175312042236 + }, + { + "episode": 20816, + "epoch": 0.12471989550754335, + "loss/policy_avg": 0.5496609807014465, + "lr": 9.169222903885482e-06, + "objective/entropy": -217.79193115234375, + "objective/kl": 33.751773834228516, + "objective/non_score_reward": -1.6875885725021362, + "objective/rlhf_reward": -5.408718755751281, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 2.2236084938049316, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 1300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001005172729492 + }, + { + "episode": 20832, + "epoch": 0.1248157601466729, + "loss/policy_avg": 0.017860662192106247, + "lr": 9.168583844580777e-06, + "objective/entropy": -255.07095336914062, + "objective/kl": 19.090106964111328, + "objective/non_score_reward": -0.9545053839683533, + "objective/rlhf_reward": -1.4180216252803803, + "objective/scores": 0.6, + "policy/approxkl_avg": 11.007080078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 1301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001258611679077 + }, + { + "episode": 20848, + "epoch": 0.12491162478580245, + "loss/policy_avg": 0.02041742019355297, + "lr": 9.167944785276074e-06, + "objective/entropy": -255.44552612304688, + "objective/kl": 40.95478057861328, + "objective/non_score_reward": -2.047739028930664, + "objective/rlhf_reward": -8.190956592559814, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.125746250152588, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5703125, + "step": 1302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999814748764038 + }, + { + "episode": 20864, + "epoch": 0.125007489424932, + "loss/policy_avg": 0.26476216316223145, + "lr": 9.167305725971371e-06, + "objective/entropy": -234.08668518066406, + "objective/kl": 31.28912353515625, + "objective/non_score_reward": -1.5644559860229492, + "objective/rlhf_reward": -4.310412774758275, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 26.78909683227539, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66015625, + "step": 1303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000152111053467 + }, + { + "episode": 20880, + "epoch": 0.12510335406406153, + "loss/policy_avg": 0.7206395864486694, + "lr": 9.166666666666666e-06, + "objective/entropy": -257.04144287109375, + "objective/kl": 32.617130279541016, + "objective/non_score_reward": -1.6308565139770508, + "objective/rlhf_reward": -4.967166869845942, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.7028008699417114, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.76171875, + "step": 1304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000716209411621 + }, + { + "episode": 20896, + "epoch": 0.1251992187031911, + "loss/policy_avg": 0.9150592088699341, + "lr": 9.166027607361963e-06, + "objective/entropy": -226.0206298828125, + "objective/kl": 28.190610885620117, + "objective/non_score_reward": -1.409530520439148, + "objective/rlhf_reward": -2.714403127075407, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.585868835449219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578125, + "step": 1305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9991176128387451 + }, + { + "episode": 20912, + "epoch": 0.12529508334232065, + "loss/policy_avg": 0.6741877198219299, + "lr": 9.16538854805726e-06, + "objective/entropy": -244.3083953857422, + "objective/kl": 30.657371520996094, + "objective/non_score_reward": -1.5328686237335205, + "objective/rlhf_reward": -4.731474375724792, + "objective/scores": 0.35, + "policy/approxkl_avg": 5.305037498474121, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.642578125, + "step": 1306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9968469142913818 + }, + { + "episode": 20928, + "epoch": 0.1253909479814502, + "loss/policy_avg": 0.09786906093358994, + "lr": 9.164749488752557e-06, + "objective/entropy": -290.24542236328125, + "objective/kl": 33.52435302734375, + "objective/non_score_reward": -1.6762176752090454, + "objective/rlhf_reward": -5.254272381873474, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 2.0794928073883057, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.671875, + "step": 1307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9996331930160522 + }, + { + "episode": 20944, + "epoch": 0.12548681262057973, + "loss/policy_avg": -0.041130807250738144, + "lr": 9.164110429447854e-06, + "objective/entropy": -246.7947235107422, + "objective/kl": 30.54619598388672, + "objective/non_score_reward": -1.5273098945617676, + "objective/rlhf_reward": -4.658641557307586, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 59.35724639892578, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.64453125, + "step": 1308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9982240200042725 + }, + { + "episode": 20960, + "epoch": 0.12558267725970929, + "loss/policy_avg": 0.05561627447605133, + "lr": 9.163471370143149e-06, + "objective/entropy": -219.60110473632812, + "objective/kl": 30.905031204223633, + "objective/non_score_reward": -1.5452516078948975, + "objective/rlhf_reward": -4.757174451549616, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 7.214193344116211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58203125, + "step": 1309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004239082336426 + }, + { + "episode": 20976, + "epoch": 0.12567854189883884, + "loss/policy_avg": 0.42176759243011475, + "lr": 9.162832310838446e-06, + "objective/entropy": -211.8623504638672, + "objective/kl": 39.876808166503906, + "objective/non_score_reward": -1.9938405752182007, + "objective/rlhf_reward": -6.649849448233766, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 6.471524238586426, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.599609375, + "step": 1310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001091957092285 + }, + { + "episode": 20992, + "epoch": 0.1257744065379684, + "loss/policy_avg": 0.1641611009836197, + "lr": 9.162193251533743e-06, + "objective/entropy": -272.44757080078125, + "objective/kl": 35.32935333251953, + "objective/non_score_reward": -1.7664676904678345, + "objective/rlhf_reward": -5.706620895598812, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 16.05602264404297, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.732421875, + "step": 1311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9982774257659912 + }, + { + "episode": 21008, + "epoch": 0.12587027117709793, + "loss/policy_avg": 0.10128459334373474, + "lr": 9.16155419222904e-06, + "objective/entropy": -218.8691864013672, + "objective/kl": 34.10152053833008, + "objective/non_score_reward": -1.7050760984420776, + "objective/rlhf_reward": -5.304532730373081, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 3.924506664276123, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.765625, + "step": 1312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0003185272216797 + }, + { + "episode": 21024, + "epoch": 0.12596613581622748, + "loss/policy_avg": -0.07266978919506073, + "lr": 9.160915132924337e-06, + "objective/entropy": -176.869140625, + "objective/kl": 32.847267150878906, + "objective/non_score_reward": -1.6423635482788086, + "objective/rlhf_reward": -4.907594447553741, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 4.819439888000488, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.646484375, + "step": 1313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0008370876312256 + }, + { + "episode": 21040, + "epoch": 0.12606200045535704, + "loss/policy_avg": 0.4377824664115906, + "lr": 9.160276073619634e-06, + "objective/entropy": -227.95974731445312, + "objective/kl": 32.87003707885742, + "objective/non_score_reward": -1.6435017585754395, + "objective/rlhf_reward": -5.2484944200813, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 101.58186340332031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.771484375, + "step": 1314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.001458168029785 + }, + { + "episode": 21056, + "epoch": 0.1261578650944866, + "loss/policy_avg": 0.029068514704704285, + "lr": 9.159637014314929e-06, + "objective/entropy": -196.00814819335938, + "objective/kl": 41.65742492675781, + "objective/non_score_reward": -2.082871437072754, + "objective/rlhf_reward": -6.208778919950996, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 3.3673386573791504, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 1315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980723857879639 + }, + { + "episode": 21072, + "epoch": 0.12625372973361612, + "loss/policy_avg": 0.9132063984870911, + "lr": 9.158997955010226e-06, + "objective/entropy": -196.53677368164062, + "objective/kl": 34.47105026245117, + "objective/non_score_reward": -1.7235524654388428, + "objective/rlhf_reward": -6.8942097425460815, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7787117958068848, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4892578125, + "step": 1316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.003011703491211 + }, + { + "episode": 21088, + "epoch": 0.12634959437274568, + "loss/policy_avg": -0.14771617949008942, + "lr": 9.158358895705522e-06, + "objective/entropy": -225.04312133789062, + "objective/kl": 25.410263061523438, + "objective/non_score_reward": -1.2705130577087402, + "objective/rlhf_reward": -5.08205258846283, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.080203056335449, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6953125, + "step": 1317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0016398429870605 + }, + { + "episode": 21104, + "epoch": 0.12644545901187523, + "loss/policy_avg": -0.03344951570034027, + "lr": 9.15771983640082e-06, + "objective/entropy": -264.9842529296875, + "objective/kl": 34.70489501953125, + "objective/non_score_reward": -1.7352447509765625, + "objective/rlhf_reward": -4.9935674173402145, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 10.510156631469727, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62109375, + "step": 1318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990448951721191 + }, + { + "episode": 21120, + "epoch": 0.1265413236510048, + "loss/policy_avg": 0.14975669980049133, + "lr": 9.157080777096116e-06, + "objective/entropy": -281.6861572265625, + "objective/kl": 38.669654846191406, + "objective/non_score_reward": -1.9334828853607178, + "objective/rlhf_reward": -6.000597850481668, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 2.9204273223876953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.576171875, + "step": 1319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9977920055389404 + }, + { + "episode": 21136, + "epoch": 0.12663718829013432, + "loss/policy_avg": -0.23473092913627625, + "lr": 9.156441717791411e-06, + "objective/entropy": -186.2064208984375, + "objective/kl": 33.10087203979492, + "objective/non_score_reward": -1.655043601989746, + "objective/rlhf_reward": -5.294661674529237, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 26.52355194091797, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.578125, + "step": 1320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002742290496826 + }, + { + "episode": 21152, + "epoch": 0.12673305292926387, + "loss/policy_avg": 0.8872619867324829, + "lr": 9.155802658486708e-06, + "objective/entropy": -265.64398193359375, + "objective/kl": 33.104408264160156, + "objective/non_score_reward": -1.6552205085754395, + "objective/rlhf_reward": -4.887548581759134, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 29.555377960205078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69921875, + "step": 1321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9992411136627197 + }, + { + "episode": 21168, + "epoch": 0.12682891756839343, + "loss/policy_avg": -0.05859680473804474, + "lr": 9.155163599182005e-06, + "objective/entropy": -179.13717651367188, + "objective/kl": 27.85260581970215, + "objective/non_score_reward": -1.3926303386688232, + "objective/rlhf_reward": -3.966401312414723, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 0.4925612211227417, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 1322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0010387897491455 + }, + { + "episode": 21184, + "epoch": 0.12692478220752298, + "loss/policy_avg": -0.12246014177799225, + "lr": 9.1545245398773e-06, + "objective/entropy": -237.7357177734375, + "objective/kl": 34.5874137878418, + "objective/non_score_reward": -1.7293705940246582, + "objective/rlhf_reward": -5.466884355159149, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 4.104196548461914, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6328125, + "step": 1323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0004804134368896 + }, + { + "episode": 21200, + "epoch": 0.1270206468466525, + "loss/policy_avg": 0.22411450743675232, + "lr": 9.153885480572597e-06, + "objective/entropy": -273.3883361816406, + "objective/kl": 42.53919219970703, + "objective/non_score_reward": -2.1269593238830566, + "objective/rlhf_reward": -6.845978026807892, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 4.946126937866211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.642578125, + "step": 1324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9990512132644653 + }, + { + "episode": 21216, + "epoch": 0.12711651148578207, + "loss/policy_avg": 0.06539204716682434, + "lr": 9.153246421267894e-06, + "objective/entropy": -167.3392333984375, + "objective/kl": 33.024253845214844, + "objective/non_score_reward": -1.6512128114700317, + "objective/rlhf_reward": -5.048591821399286, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 4.672647476196289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40234375, + "step": 1325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.005505084991455 + }, + { + "episode": 21232, + "epoch": 0.12721237612491162, + "loss/policy_avg": 0.17451216280460358, + "lr": 9.152607361963191e-06, + "objective/entropy": -168.8487548828125, + "objective/kl": 26.45303726196289, + "objective/non_score_reward": -1.3226518630981445, + "objective/rlhf_reward": -3.965094718962831, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 1.9421508312225342, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51953125, + "step": 1326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0126333236694336 + }, + { + "episode": 21248, + "epoch": 0.12730824076404118, + "loss/policy_avg": -0.005517004989087582, + "lr": 9.151968302658488e-06, + "objective/entropy": -176.31719970703125, + "objective/kl": 18.665822982788086, + "objective/non_score_reward": -0.9332911968231201, + "objective/rlhf_reward": -2.3093326284485736, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 1.5738120079040527, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.525390625, + "step": 1327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999518394470215 + }, + { + "episode": 21264, + "epoch": 0.12740410540317074, + "loss/policy_avg": 0.1618424952030182, + "lr": 9.151329243353783e-06, + "objective/entropy": -217.6151123046875, + "objective/kl": 24.312286376953125, + "objective/non_score_reward": -1.2156143188476562, + "objective/rlhf_reward": -2.9150461656617477, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 30.008869171142578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.74609375, + "step": 1328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000408172607422 + }, + { + "episode": 21280, + "epoch": 0.12749997004230026, + "loss/policy_avg": 0.15089087188243866, + "lr": 9.15069018404908e-06, + "objective/entropy": -231.04893493652344, + "objective/kl": 34.12983322143555, + "objective/non_score_reward": -1.7064917087554932, + "objective/rlhf_reward": -5.48433118155542, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 37.14998245239258, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.78125, + "step": 1329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9984626770019531 + }, + { + "episode": 21296, + "epoch": 0.12759583468142982, + "loss/policy_avg": 0.13896551728248596, + "lr": 9.150051124744377e-06, + "objective/entropy": -192.38351440429688, + "objective/kl": 22.335050582885742, + "objective/non_score_reward": -1.1167525053024292, + "objective/rlhf_reward": -3.0164120002702326, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 3.72003173828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 1330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9977521896362305 + }, + { + "episode": 21312, + "epoch": 0.12769169932055938, + "loss/policy_avg": 0.038389697670936584, + "lr": 9.149412065439674e-06, + "objective/entropy": -206.938232421875, + "objective/kl": 23.90731430053711, + "objective/non_score_reward": -1.1953657865524292, + "objective/rlhf_reward": -3.265691363605198, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 1.9220904111862183, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.564453125, + "step": 1331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0020275115966797 + }, + { + "episode": 21328, + "epoch": 0.12778756395968893, + "loss/policy_avg": 0.22985966503620148, + "lr": 9.14877300613497e-06, + "objective/entropy": -236.96868896484375, + "objective/kl": 20.446491241455078, + "objective/non_score_reward": -1.022324562072754, + "objective/rlhf_reward": -1.6892985463142396, + "objective/scores": 0.6, + "policy/approxkl_avg": 16.64617347717285, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.96875, + "step": 1332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000746726989746 + }, + { + "episode": 21344, + "epoch": 0.12788342859881846, + "loss/policy_avg": 0.3077865540981293, + "lr": 9.148133946830266e-06, + "objective/entropy": -234.60574340820312, + "objective/kl": 38.31067657470703, + "objective/non_score_reward": -1.9155337810516357, + "objective/rlhf_reward": -4.7384163483392925, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.95224380493164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73046875, + "step": 1333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998931884765625 + }, + { + "episode": 21360, + "epoch": 0.12797929323794802, + "loss/policy_avg": 0.13958078622817993, + "lr": 9.147494887525563e-06, + "objective/entropy": -273.56170654296875, + "objective/kl": 31.292470932006836, + "objective/non_score_reward": -1.5646235942840576, + "objective/rlhf_reward": -1.8584942579269406, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.8678287267684937, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619140625, + "step": 1334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.997137188911438 + }, + { + "episode": 21376, + "epoch": 0.12807515787707757, + "loss/policy_avg": 0.42439746856689453, + "lr": 9.14685582822086e-06, + "objective/entropy": -267.5999755859375, + "objective/kl": 33.0029296875, + "objective/non_score_reward": -1.650146484375, + "objective/rlhf_reward": -5.2750730848609635, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 4.55873441696167, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66015625, + "step": 1335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997644424438477 + }, + { + "episode": 21392, + "epoch": 0.12817102251620713, + "loss/policy_avg": 0.002216493710875511, + "lr": 9.146216768916156e-06, + "objective/entropy": -226.58786010742188, + "objective/kl": 22.239288330078125, + "objective/non_score_reward": -1.111964464187622, + "objective/rlhf_reward": -2.500446598009999, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 11.510183334350586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6953125, + "step": 1336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985392093658447 + }, + { + "episode": 21408, + "epoch": 0.12826688715533666, + "loss/policy_avg": 0.2896654009819031, + "lr": 9.145577709611453e-06, + "objective/entropy": -275.9249267578125, + "objective/kl": 33.59234619140625, + "objective/non_score_reward": -1.679617166519165, + "objective/rlhf_reward": -4.893640096458506, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.7253494262695312, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.728515625, + "step": 1337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9982388019561768 + }, + { + "episode": 21424, + "epoch": 0.1283627517944662, + "loss/policy_avg": 0.4936927556991577, + "lr": 9.14493865030675e-06, + "objective/entropy": -174.52462768554688, + "objective/kl": 30.66004180908203, + "objective/non_score_reward": -1.5330020189285278, + "objective/rlhf_reward": -4.7320081949234005, + "objective/scores": 0.35, + "policy/approxkl_avg": 3.5361199378967285, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.52734375, + "step": 1338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999757170677185 + }, + { + "episode": 21440, + "epoch": 0.12845861643359577, + "loss/policy_avg": 0.4622963070869446, + "lr": 9.144299591002045e-06, + "objective/entropy": -278.365966796875, + "objective/kl": 37.1561393737793, + "objective/non_score_reward": -1.8578070402145386, + "objective/rlhf_reward": -5.606399412426065, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 12.119524002075195, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640625, + "step": 1339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9970488548278809 + }, + { + "episode": 21456, + "epoch": 0.12855448107272532, + "loss/policy_avg": 0.1313559114933014, + "lr": 9.143660531697342e-06, + "objective/entropy": -254.86607360839844, + "objective/kl": 35.33464813232422, + "objective/non_score_reward": -1.7667322158813477, + "objective/rlhf_reward": -4.143209849239561, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 22.45963478088379, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9981313943862915 + }, + { + "episode": 21472, + "epoch": 0.12865034571185485, + "loss/policy_avg": 0.38973551988601685, + "lr": 9.143021472392639e-06, + "objective/entropy": -272.54193115234375, + "objective/kl": 27.13404655456543, + "objective/non_score_reward": -1.3567023277282715, + "objective/rlhf_reward": -3.693476096789042, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.679624080657959, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.658203125, + "step": 1341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9994499683380127 + }, + { + "episode": 21488, + "epoch": 0.1287462103509844, + "loss/policy_avg": 0.11905691772699356, + "lr": 9.142382413087936e-06, + "objective/entropy": -210.89501953125, + "objective/kl": 28.64351463317871, + "objective/non_score_reward": -1.432175636291504, + "objective/rlhf_reward": -2.804983530880186, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.039191246032715, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 1342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9983291625976562 + }, + { + "episode": 21504, + "epoch": 0.12884207499011396, + "loss/policy_avg": 0.14720244705677032, + "lr": 9.141743353783233e-06, + "objective/entropy": -224.0950164794922, + "objective/kl": 25.995969772338867, + "objective/non_score_reward": -1.2997984886169434, + "objective/rlhf_reward": -2.2754748209726543, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.1929235458374023, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.771484375, + "step": 1343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995906352996826 + }, + { + "episode": 21520, + "epoch": 0.12893793962924352, + "loss/policy_avg": 0.17890335619449615, + "lr": 9.14110429447853e-06, + "objective/entropy": -250.506103515625, + "objective/kl": 29.027278900146484, + "objective/non_score_reward": -1.4513640403747559, + "objective/rlhf_reward": -1.405456072092056, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.065425872802734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80078125, + "step": 1344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9962573051452637 + }, + { + "episode": 21536, + "epoch": 0.12903380426837305, + "loss/policy_avg": 0.08815973997116089, + "lr": 9.140465235173825e-06, + "objective/entropy": -284.2688293457031, + "objective/kl": 30.836158752441406, + "objective/non_score_reward": -1.5418078899383545, + "objective/rlhf_reward": -4.5053721718197925, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 6.909310340881348, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.783203125, + "step": 1345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9966943264007568 + }, + { + "episode": 21552, + "epoch": 0.1291296689075026, + "loss/policy_avg": 0.024522747844457626, + "lr": 9.13982617586912e-06, + "objective/entropy": -161.51828002929688, + "objective/kl": 30.234634399414062, + "objective/non_score_reward": -1.5117316246032715, + "objective/rlhf_reward": -4.721413645774049, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 1.0650488138198853, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.587890625, + "step": 1346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 2.0002496242523193 + }, + { + "episode": 21568, + "epoch": 0.12922553354663216, + "loss/policy_avg": 0.5536386966705322, + "lr": 9.139187116564417e-06, + "objective/entropy": -235.6590118408203, + "objective/kl": 32.029144287109375, + "objective/non_score_reward": -1.6014573574066162, + "objective/rlhf_reward": -4.283122958914314, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 21.44164276123047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 1347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997905969619751 + }, + { + "episode": 21584, + "epoch": 0.12932139818576172, + "loss/policy_avg": -0.14616435766220093, + "lr": 9.138548057259714e-06, + "objective/entropy": -236.11582946777344, + "objective/kl": 26.366846084594727, + "objective/non_score_reward": -1.3183423280715942, + "objective/rlhf_reward": -3.947856519251985, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 8.052356719970703, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.68359375, + "step": 1348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.003175735473633 + }, + { + "episode": 21600, + "epoch": 0.12941726282489124, + "loss/policy_avg": 0.22792214155197144, + "lr": 9.13790899795501e-06, + "objective/entropy": -209.01907348632812, + "objective/kl": 33.44483184814453, + "objective/non_score_reward": -1.6722415685653687, + "objective/rlhf_reward": -5.31036422499786, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 15.402750015258789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.81640625, + "step": 1349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000248432159424 + }, + { + "episode": 21616, + "epoch": 0.1295131274640208, + "loss/policy_avg": 0.20839962363243103, + "lr": 9.137269938650308e-06, + "objective/entropy": -292.2127990722656, + "objective/kl": 29.052120208740234, + "objective/non_score_reward": -1.452605962753296, + "objective/rlhf_reward": -1.4104240894317623, + "objective/scores": 1.1, + "policy/approxkl_avg": 76.34044647216797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.580078125, + "step": 1350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.997493028640747 + }, + { + "episode": 21632, + "epoch": 0.12960899210315036, + "loss/policy_avg": -0.08632227778434753, + "lr": 9.136630879345604e-06, + "objective/entropy": -173.5177764892578, + "objective/kl": 29.301441192626953, + "objective/non_score_reward": -1.4650721549987793, + "objective/rlhf_reward": -4.481686213103634, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 2.284684658050537, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.60546875, + "step": 1351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002286434173584 + }, + { + "episode": 21648, + "epoch": 0.1297048567422799, + "loss/policy_avg": 0.10895340144634247, + "lr": 9.1359918200409e-06, + "objective/entropy": -279.0048828125, + "objective/kl": 34.87440872192383, + "objective/non_score_reward": -1.7437204122543335, + "objective/rlhf_reward": -2.5748816490173336, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.06716537475586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 1352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.997962236404419 + }, + { + "episode": 21664, + "epoch": 0.12980072138140944, + "loss/policy_avg": 0.2365398406982422, + "lr": 9.135352760736197e-06, + "objective/entropy": -250.8545379638672, + "objective/kl": 30.62120819091797, + "objective/non_score_reward": -1.5310604572296143, + "objective/rlhf_reward": -3.2005228146326274, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.900188446044922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 1353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997548222541809 + }, + { + "episode": 21680, + "epoch": 0.129896586020539, + "loss/policy_avg": -0.0653112605214119, + "lr": 9.134713701431493e-06, + "objective/entropy": -263.48004150390625, + "objective/kl": 24.03810691833496, + "objective/non_score_reward": -1.2019054889678955, + "objective/rlhf_reward": -2.407621836662292, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.6270973682403564, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.62890625, + "step": 1354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00001859664917 + }, + { + "episode": 21696, + "epoch": 0.12999245065966855, + "loss/policy_avg": 0.24467766284942627, + "lr": 9.13407464212679e-06, + "objective/entropy": -237.13613891601562, + "objective/kl": 24.655006408691406, + "objective/non_score_reward": -1.2327501773834229, + "objective/rlhf_reward": -0.5310010671615597, + "objective/scores": 1.1, + "policy/approxkl_avg": 23.784618377685547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 1355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.004101276397705 + }, + { + "episode": 21712, + "epoch": 0.1300883152987981, + "loss/policy_avg": 0.0691906288266182, + "lr": 9.133435582822087e-06, + "objective/entropy": -244.44912719726562, + "objective/kl": 30.4073486328125, + "objective/non_score_reward": -1.5203675031661987, + "objective/rlhf_reward": -1.6814697742462155, + "objective/scores": 1.1, + "policy/approxkl_avg": 0.9179539680480957, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6640625, + "step": 1356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001126289367676 + }, + { + "episode": 21728, + "epoch": 0.13018417993792764, + "loss/policy_avg": 0.21818453073501587, + "lr": 9.132796523517384e-06, + "objective/entropy": -227.47018432617188, + "objective/kl": 28.718124389648438, + "objective/non_score_reward": -1.435906171798706, + "objective/rlhf_reward": -3.918795819553446, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 0.8305081129074097, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.697265625, + "step": 1357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9998927116394043 + }, + { + "episode": 21744, + "epoch": 0.1302800445770572, + "loss/policy_avg": 0.5288101434707642, + "lr": 9.13215746421268e-06, + "objective/entropy": -254.03286743164062, + "objective/kl": 40.13897705078125, + "objective/non_score_reward": -2.006948947906494, + "objective/rlhf_reward": -6.471536367145136, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.1408591270446777, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5859375, + "step": 1358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989702701568604 + }, + { + "episode": 21760, + "epoch": 0.13037590921618675, + "loss/policy_avg": -0.015707701444625854, + "lr": 9.131518404907976e-06, + "objective/entropy": -235.0547637939453, + "objective/kl": 34.96942901611328, + "objective/non_score_reward": -1.7484712600708008, + "objective/rlhf_reward": -5.57005317946252, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 39.12907409667969, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.666015625, + "step": 1359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975736141204834 + }, + { + "episode": 21776, + "epoch": 0.1304717738553163, + "loss/policy_avg": -0.031348615884780884, + "lr": 9.130879345603273e-06, + "objective/entropy": -216.28042602539062, + "objective/kl": 31.17209243774414, + "objective/non_score_reward": -1.5586044788360596, + "objective/rlhf_reward": -4.111712040678535, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 0.8966926336288452, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 1360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001847267150879 + }, + { + "episode": 21792, + "epoch": 0.13056763849444583, + "loss/policy_avg": 0.30280882120132446, + "lr": 9.13024028629857e-06, + "objective/entropy": -213.22189331054688, + "objective/kl": 31.471433639526367, + "objective/non_score_reward": -1.573571801185608, + "objective/rlhf_reward": -4.632427459180938, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 3.3364553451538086, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.595703125, + "step": 1361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9998619556427002 + }, + { + "episode": 21808, + "epoch": 0.1306635031335754, + "loss/policy_avg": 0.20570358633995056, + "lr": 9.129601226993867e-06, + "objective/entropy": -179.83119201660156, + "objective/kl": 25.478784561157227, + "objective/non_score_reward": -1.2739393711090088, + "objective/rlhf_reward": -3.6957572460174557, + "objective/scores": 0.35, + "policy/approxkl_avg": 90.71241760253906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55078125, + "step": 1362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997685194015503 + }, + { + "episode": 21824, + "epoch": 0.13075936777270494, + "loss/policy_avg": 0.6416128873825073, + "lr": 9.128962167689162e-06, + "objective/entropy": -272.2728271484375, + "objective/kl": 32.88115692138672, + "objective/non_score_reward": -1.6440578699111938, + "objective/rlhf_reward": -5.019972293582514, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 12.951141357421875, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.572265625, + "step": 1363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987308979034424 + }, + { + "episode": 21840, + "epoch": 0.1308552324118345, + "loss/policy_avg": -0.41209667921066284, + "lr": 9.128323108384459e-06, + "objective/entropy": -244.28286743164062, + "objective/kl": 30.94601058959961, + "objective/non_score_reward": -1.5473005771636963, + "objective/rlhf_reward": -3.2654829963457317, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 22.515792846679688, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.501953125, + "step": 1364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0037689208984375 + }, + { + "episode": 21856, + "epoch": 0.13095109705096403, + "loss/policy_avg": -0.01563386619091034, + "lr": 9.127684049079756e-06, + "objective/entropy": -255.2119140625, + "objective/kl": 27.473278045654297, + "objective/non_score_reward": -1.3736639022827148, + "objective/rlhf_reward": -3.8905355072656445, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.797173261642456, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.541015625, + "step": 1365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9988737106323242 + }, + { + "episode": 21872, + "epoch": 0.13104696169009358, + "loss/policy_avg": 0.23629775643348694, + "lr": 9.127044989775053e-06, + "objective/entropy": -272.95880126953125, + "objective/kl": 34.79148483276367, + "objective/non_score_reward": -1.7395741939544678, + "objective/rlhf_reward": -6.958296895027161, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5488548278808594, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.57421875, + "step": 1366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 2.0178794860839844 + }, + { + "episode": 21888, + "epoch": 0.13114282632922314, + "loss/policy_avg": -0.18055079877376556, + "lr": 9.126405930470348e-06, + "objective/entropy": -238.3826904296875, + "objective/kl": 31.74860191345215, + "objective/non_score_reward": -1.5874300003051758, + "objective/rlhf_reward": -4.8991220994905085, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 6.933784484863281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.638671875, + "step": 1367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982986450195312 + }, + { + "episode": 21904, + "epoch": 0.1312386909683527, + "loss/policy_avg": 0.5160447359085083, + "lr": 9.125766871165645e-06, + "objective/entropy": -277.48004150390625, + "objective/kl": 33.97571563720703, + "objective/non_score_reward": -1.6987860202789307, + "objective/rlhf_reward": -4.6724373719849925, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 15.784793853759766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6015625, + "step": 1368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9972174167633057 + }, + { + "episode": 21920, + "epoch": 0.13133455560748222, + "loss/policy_avg": 0.09226138889789581, + "lr": 9.125127811860942e-06, + "objective/entropy": -288.6790466308594, + "objective/kl": 26.108116149902344, + "objective/non_score_reward": -1.305405855178833, + "objective/rlhf_reward": -0.8216233015060421, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.192481994628906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55078125, + "step": 1369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981337785720825 + }, + { + "episode": 21936, + "epoch": 0.13143042024661178, + "loss/policy_avg": -0.005547836422920227, + "lr": 9.124488752556238e-06, + "objective/entropy": -261.7656555175781, + "objective/kl": 39.76494216918945, + "objective/non_score_reward": -1.988247036933899, + "objective/rlhf_reward": -6.291128640592682, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.2966415882110596, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 1370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999891996383667 + }, + { + "episode": 21952, + "epoch": 0.13152628488574133, + "loss/policy_avg": 0.1910111904144287, + "lr": 9.123849693251534e-06, + "objective/entropy": -71.11714935302734, + "objective/kl": 37.65461730957031, + "objective/non_score_reward": -1.8827309608459473, + "objective/rlhf_reward": -4.607204352260801, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.529366493225098, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.763671875, + "step": 1371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.99833345413208 + }, + { + "episode": 21968, + "epoch": 0.1316221495248709, + "loss/policy_avg": 1.7684245109558105, + "lr": 9.12321063394683e-06, + "objective/entropy": -231.41371154785156, + "objective/kl": 34.49894714355469, + "objective/non_score_reward": -1.7249473333358765, + "objective/rlhf_reward": -5.237929766595946, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 61.1027717590332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5546875, + "step": 1372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997320175170898 + }, + { + "episode": 21984, + "epoch": 0.13171801416400042, + "loss/policy_avg": -0.03314230218529701, + "lr": 9.122571574642127e-06, + "objective/entropy": -290.369384765625, + "objective/kl": 32.8198127746582, + "objective/non_score_reward": -1.6409904956817627, + "objective/rlhf_reward": -5.2384496069251725, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 13.562549591064453, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6171875, + "step": 1373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0022025108337402 + }, + { + "episode": 22000, + "epoch": 0.13181387880312997, + "loss/policy_avg": 0.11191444098949432, + "lr": 9.121932515337424e-06, + "objective/entropy": -228.10528564453125, + "objective/kl": 38.98127746582031, + "objective/non_score_reward": -1.9490638971328735, + "objective/rlhf_reward": -6.470742616683168, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 2.0331411361694336, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.564453125, + "step": 1374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986833333969116 + }, + { + "episode": 22016, + "epoch": 0.13190974344225953, + "loss/policy_avg": -0.26164868474006653, + "lr": 9.121293456032721e-06, + "objective/entropy": -226.21148681640625, + "objective/kl": 34.36164855957031, + "objective/non_score_reward": -1.718082308769226, + "objective/rlhf_reward": -5.4723293542861935, + "objective/scores": 0.35, + "policy/approxkl_avg": 3.828913450241089, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.673828125, + "step": 1375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985249042510986 + }, + { + "episode": 22032, + "epoch": 0.1320056080813891, + "loss/policy_avg": 0.21197950839996338, + "lr": 9.120654396728016e-06, + "objective/entropy": -255.317138671875, + "objective/kl": 44.30939865112305, + "objective/non_score_reward": -2.2154700756073, + "objective/rlhf_reward": -7.483277657119137, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 5.002331733703613, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619140625, + "step": 1376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9972087144851685 + }, + { + "episode": 22048, + "epoch": 0.13210147272051861, + "loss/policy_avg": 0.2008858621120453, + "lr": 9.120015337423313e-06, + "objective/entropy": -194.98388671875, + "objective/kl": 29.178813934326172, + "objective/non_score_reward": -1.4589406251907349, + "objective/rlhf_reward": -4.510249648123903, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 1.7124892473220825, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5546875, + "step": 1377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999535322189331 + }, + { + "episode": 22064, + "epoch": 0.13219733735964817, + "loss/policy_avg": 0.2521211802959442, + "lr": 9.11937627811861e-06, + "objective/entropy": -275.01416015625, + "objective/kl": 40.16548538208008, + "objective/non_score_reward": -2.008274555206299, + "objective/rlhf_reward": -6.707584772139711, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 8.357677459716797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 1378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9994690418243408 + }, + { + "episode": 22080, + "epoch": 0.13229320199877773, + "loss/policy_avg": 0.20335987210273743, + "lr": 9.118737218813907e-06, + "objective/entropy": -219.05987548828125, + "objective/kl": 28.999086380004883, + "objective/non_score_reward": -1.4499542713165283, + "objective/rlhf_reward": -4.421214976397854, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.1380681991577148, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6015625, + "step": 1379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001814365386963 + }, + { + "episode": 22096, + "epoch": 0.13238906663790728, + "loss/policy_avg": -0.17124547064304352, + "lr": 9.118098159509204e-06, + "objective/entropy": -102.70747375488281, + "objective/kl": 36.22713851928711, + "objective/non_score_reward": -1.8113569021224976, + "objective/rlhf_reward": -5.866825440017086, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 0.37671273946762085, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.560546875, + "step": 1380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001765251159668 + }, + { + "episode": 22112, + "epoch": 0.1324849312770368, + "loss/policy_avg": 0.6594262719154358, + "lr": 9.1174591002045e-06, + "objective/entropy": -239.76181030273438, + "objective/kl": 38.55724334716797, + "objective/non_score_reward": -1.9278624057769775, + "objective/rlhf_reward": -6.332847335425717, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 63.58997344970703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.748046875, + "step": 1381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9986870288848877 + }, + { + "episode": 22128, + "epoch": 0.13258079591616637, + "loss/policy_avg": 0.26639020442962646, + "lr": 9.116820040899796e-06, + "objective/entropy": -228.43006896972656, + "objective/kl": 41.377357482910156, + "objective/non_score_reward": -2.0688676834106445, + "objective/rlhf_reward": -5.875471210479736, + "objective/scores": 0.6, + "policy/approxkl_avg": 24.236587524414062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.580078125, + "step": 1382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.997843623161316 + }, + { + "episode": 22144, + "epoch": 0.13267666055529592, + "loss/policy_avg": 0.22560517489910126, + "lr": 9.116180981595093e-06, + "objective/entropy": -275.1982421875, + "objective/kl": 33.85704040527344, + "objective/non_score_reward": -1.6928520202636719, + "objective/rlhf_reward": -4.371408081054687, + "objective/scores": 0.6, + "policy/approxkl_avg": 50.331180572509766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.595703125, + "step": 1383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996649980545044 + }, + { + "episode": 22160, + "epoch": 0.13277252519442548, + "loss/policy_avg": 0.3458302617073059, + "lr": 9.11554192229039e-06, + "objective/entropy": -284.34478759765625, + "objective/kl": 41.527374267578125, + "objective/non_score_reward": -2.076368808746338, + "objective/rlhf_reward": -6.6436157278424375, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 3.7493739128112793, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.615234375, + "step": 1384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000601291656494 + }, + { + "episode": 22176, + "epoch": 0.13286838983355503, + "loss/policy_avg": 0.04170902818441391, + "lr": 9.114902862985686e-06, + "objective/entropy": -232.59671020507812, + "objective/kl": 23.403087615966797, + "objective/non_score_reward": -1.1701544523239136, + "objective/rlhf_reward": -3.2806179285049435, + "objective/scores": 0.35, + "policy/approxkl_avg": 11.652734756469727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.642578125, + "step": 1385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0049519538879395 + }, + { + "episode": 22192, + "epoch": 0.13296425447268456, + "loss/policy_avg": 0.7652486562728882, + "lr": 9.114263803680983e-06, + "objective/entropy": -264.1070251464844, + "objective/kl": 38.55071258544922, + "objective/non_score_reward": -1.9275355339050293, + "objective/rlhf_reward": -5.885313267978739, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 16.973102569580078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.732421875, + "step": 1386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9949281215667725 + }, + { + "episode": 22208, + "epoch": 0.13306011911181412, + "loss/policy_avg": 0.26107269525527954, + "lr": 9.113624744376279e-06, + "objective/entropy": -274.548828125, + "objective/kl": 29.125957489013672, + "objective/non_score_reward": -1.4562978744506836, + "objective/rlhf_reward": -4.37459359607254, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 11.973018646240234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.669921875, + "step": 1387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.997715711593628 + }, + { + "episode": 22224, + "epoch": 0.13315598375094367, + "loss/policy_avg": 0.1225675493478775, + "lr": 9.112985685071575e-06, + "objective/entropy": -218.16091918945312, + "objective/kl": 35.720855712890625, + "objective/non_score_reward": -1.7860426902770996, + "objective/rlhf_reward": -5.196759770588811, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 32.86650085449219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8046875, + "step": 1388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0017457008361816 + }, + { + "episode": 22240, + "epoch": 0.13325184839007323, + "loss/policy_avg": 0.343703955411911, + "lr": 9.112346625766872e-06, + "objective/entropy": -225.82559204101562, + "objective/kl": 24.557886123657227, + "objective/non_score_reward": -1.2278943061828613, + "objective/rlhf_reward": -3.5523273584589194, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.353696584701538, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.671875, + "step": 1389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.00032901763916 + }, + { + "episode": 22256, + "epoch": 0.13334771302920276, + "loss/policy_avg": 0.22060903906822205, + "lr": 9.111707566462168e-06, + "objective/entropy": -259.0480651855469, + "objective/kl": 25.11700439453125, + "objective/non_score_reward": -1.2558501958847046, + "objective/rlhf_reward": -3.467141597476557, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.6127742528915405, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.638671875, + "step": 1390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9998914003372192 + }, + { + "episode": 22272, + "epoch": 0.1334435776683323, + "loss/policy_avg": 0.0028184684924781322, + "lr": 9.111068507157464e-06, + "objective/entropy": -277.342041015625, + "objective/kl": 34.34157180786133, + "objective/non_score_reward": -1.7170785665512085, + "objective/rlhf_reward": -5.352542483600315, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 6.7859721183776855, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.650390625, + "step": 1391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9976534843444824 + }, + { + "episode": 22288, + "epoch": 0.13353944230746187, + "loss/policy_avg": 0.2673591077327728, + "lr": 9.110429447852761e-06, + "objective/entropy": -272.81146240234375, + "objective/kl": 32.08586883544922, + "objective/non_score_reward": -1.6042933464050293, + "objective/rlhf_reward": -4.993341405590144, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 66.21798706054688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.57421875, + "step": 1392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9972776174545288 + }, + { + "episode": 22304, + "epoch": 0.13363530694659143, + "loss/policy_avg": 0.9891442060470581, + "lr": 9.109790388548058e-06, + "objective/entropy": -157.80642700195312, + "objective/kl": 41.81775665283203, + "objective/non_score_reward": -2.0908877849578857, + "objective/rlhf_reward": -7.02191524794641, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 140.4240264892578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 1393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975658655166626 + }, + { + "episode": 22320, + "epoch": 0.13373117158572095, + "loss/policy_avg": 0.6556341052055359, + "lr": 9.109151329243355e-06, + "objective/entropy": -253.7165069580078, + "objective/kl": 27.159475326538086, + "objective/non_score_reward": -1.3579738140106201, + "objective/rlhf_reward": -3.3091887853303295, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 15.467697143554688, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.47265625, + "step": 1394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.006256580352783 + }, + { + "episode": 22336, + "epoch": 0.1338270362248505, + "loss/policy_avg": -0.04786435142159462, + "lr": 9.10851226993865e-06, + "objective/entropy": -281.2685546875, + "objective/kl": 30.538103103637695, + "objective/non_score_reward": -1.5269051790237427, + "objective/rlhf_reward": -4.591848814281162, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 12.983705520629883, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.544921875, + "step": 1395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0013580322265625 + }, + { + "episode": 22352, + "epoch": 0.13392290086398007, + "loss/policy_avg": -0.14957058429718018, + "lr": 9.107873210633947e-06, + "objective/entropy": -260.2832946777344, + "objective/kl": 37.010498046875, + "objective/non_score_reward": -1.85052490234375, + "objective/rlhf_reward": -6.0604638366991574, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.588924407958984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 1396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982390403747559 + }, + { + "episode": 22368, + "epoch": 0.13401876550310962, + "loss/policy_avg": 0.03792187571525574, + "lr": 9.107234151329244e-06, + "objective/entropy": -202.82089233398438, + "objective/kl": 28.890417098999023, + "objective/non_score_reward": -1.4445207118988037, + "objective/rlhf_reward": -4.418833160136623, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 2.8451006412506104, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 1397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0012331008911133 + }, + { + "episode": 22384, + "epoch": 0.13411463014223915, + "loss/policy_avg": 0.26423919200897217, + "lr": 9.10659509202454e-06, + "objective/entropy": -199.65274047851562, + "objective/kl": 19.250408172607422, + "objective/non_score_reward": -0.9625204205513, + "objective/rlhf_reward": -2.245961788956242, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 6.399721145629883, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.638671875, + "step": 1398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9996623992919922 + }, + { + "episode": 22400, + "epoch": 0.1342104947813687, + "loss/policy_avg": 0.2948363423347473, + "lr": 9.105956032719838e-06, + "objective/entropy": -214.3868865966797, + "objective/kl": 25.899425506591797, + "objective/non_score_reward": -1.294971227645874, + "objective/rlhf_reward": -3.820635103915615, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 11.461451530456543, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.51171875, + "step": 1399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9998140335083008 + }, + { + "episode": 22416, + "epoch": 0.13430635942049826, + "loss/policy_avg": -0.059535130858421326, + "lr": 9.105316973415133e-06, + "objective/entropy": -251.36669921875, + "objective/kl": 24.37100601196289, + "objective/non_score_reward": -1.218550205230713, + "objective/rlhf_reward": -3.474200969934463, + "objective/scores": 0.35, + "policy/approxkl_avg": 6.021501541137695, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5234375, + "step": 1400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0187039375305176 + }, + { + "episode": 22432, + "epoch": 0.13440222405962782, + "loss/policy_avg": 0.09361746907234192, + "lr": 9.10467791411043e-06, + "objective/entropy": -250.89463806152344, + "objective/kl": 40.795570373535156, + "objective/non_score_reward": -2.039778470993042, + "objective/rlhf_reward": -6.759113764762878, + "objective/scores": 0.35, + "policy/approxkl_avg": 2.102271795272827, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.697265625, + "step": 1401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9986069202423096 + }, + { + "episode": 22448, + "epoch": 0.13449808869875735, + "loss/policy_avg": 0.3411298990249634, + "lr": 9.104038854805727e-06, + "objective/entropy": -189.4188232421875, + "objective/kl": 21.962203979492188, + "objective/non_score_reward": -1.0981099605560303, + "objective/rlhf_reward": -2.9418420597032156, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 3.4877538681030273, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5234375, + "step": 1402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997504711151123 + }, + { + "episode": 22464, + "epoch": 0.1345939533378869, + "loss/policy_avg": 0.1954708993434906, + "lr": 9.103399795501024e-06, + "objective/entropy": -215.69268798828125, + "objective/kl": 37.367271423339844, + "objective/non_score_reward": -1.868363618850708, + "objective/rlhf_reward": -6.022856573672637, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 3.3153672218322754, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.544921875, + "step": 1403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9988248348236084 + }, + { + "episode": 22480, + "epoch": 0.13468981797701646, + "loss/policy_avg": -0.3388468027114868, + "lr": 9.10276073619632e-06, + "objective/entropy": -247.29574584960938, + "objective/kl": 33.42229080200195, + "objective/non_score_reward": -1.671114444732666, + "objective/rlhf_reward": -5.080338153902607, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 8.782747268676758, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5390625, + "step": 1404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0028865337371826 + }, + { + "episode": 22496, + "epoch": 0.134785682616146, + "loss/policy_avg": 0.3877559304237366, + "lr": 9.102121676891617e-06, + "objective/entropy": -236.95657348632812, + "objective/kl": 35.144378662109375, + "objective/non_score_reward": -1.7572189569473267, + "objective/rlhf_reward": -4.906169237867866, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 23.945987701416016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 1405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998518705368042 + }, + { + "episode": 22512, + "epoch": 0.13488154725527554, + "loss/policy_avg": 0.1250351220369339, + "lr": 9.101482617586912e-06, + "objective/entropy": -219.36123657226562, + "objective/kl": 27.327880859375, + "objective/non_score_reward": -1.36639404296875, + "objective/rlhf_reward": -4.041744013031093, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 17.805503845214844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 1406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9986976385116577 + }, + { + "episode": 22528, + "epoch": 0.1349774118944051, + "loss/policy_avg": 0.07265815138816833, + "lr": 9.10084355828221e-06, + "objective/entropy": -289.05718994140625, + "objective/kl": 26.725826263427734, + "objective/non_score_reward": -1.3362910747528076, + "objective/rlhf_reward": -3.829392933639225, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 4.363107204437256, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 1407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9994230270385742 + }, + { + "episode": 22544, + "epoch": 0.13507327653353465, + "loss/policy_avg": -0.3271891176700592, + "lr": 9.100204498977506e-06, + "objective/entropy": -243.45018005371094, + "objective/kl": 35.21052551269531, + "objective/non_score_reward": -1.7605262994766235, + "objective/rlhf_reward": -5.70046954443994, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 12.5887451171875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.611328125, + "step": 1408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998884201049805 + }, + { + "episode": 22560, + "epoch": 0.1351691411726642, + "loss/policy_avg": 0.20253193378448486, + "lr": 9.099565439672803e-06, + "objective/entropy": -197.681640625, + "objective/kl": 29.323577880859375, + "objective/non_score_reward": -1.4661788940429688, + "objective/rlhf_reward": -3.917304108815129, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 0.8307449817657471, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 1409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00246524810791 + }, + { + "episode": 22576, + "epoch": 0.13526500581179374, + "loss/policy_avg": 0.3828544020652771, + "lr": 9.0989263803681e-06, + "objective/entropy": -283.22674560546875, + "objective/kl": 28.88727378845215, + "objective/non_score_reward": -1.4443637132644653, + "objective/rlhf_reward": -4.435819080382019, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 37.23944854736328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.61328125, + "step": 1410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9961497783660889 + }, + { + "episode": 22592, + "epoch": 0.1353608704509233, + "loss/policy_avg": 0.0013767257332801819, + "lr": 9.098287321063395e-06, + "objective/entropy": -14.409706115722656, + "objective/kl": 35.32271194458008, + "objective/non_score_reward": -1.7661356925964355, + "objective/rlhf_reward": -5.705292427276058, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 19.86726188659668, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.67578125, + "step": 1411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000868797302246 + }, + { + "episode": 22608, + "epoch": 0.13545673509005285, + "loss/policy_avg": 0.3584628403186798, + "lr": 9.097648261758692e-06, + "objective/entropy": -275.2825927734375, + "objective/kl": 30.668413162231445, + "objective/non_score_reward": -1.5334208011627197, + "objective/rlhf_reward": -4.186271618084843, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 3.2139475345611572, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 1412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9992797374725342 + }, + { + "episode": 22624, + "epoch": 0.1355525997291824, + "loss/policy_avg": 0.07676204293966293, + "lr": 9.097009202453987e-06, + "objective/entropy": -161.75140380859375, + "objective/kl": 26.458412170410156, + "objective/non_score_reward": -1.322920560836792, + "objective/rlhf_reward": -3.8107296256378884, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 1.3583320379257202, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6484375, + "step": 1413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001204013824463 + }, + { + "episode": 22640, + "epoch": 0.13564846436831193, + "loss/policy_avg": -0.348129540681839, + "lr": 9.096370143149284e-06, + "objective/entropy": -161.31414794921875, + "objective/kl": 40.132015228271484, + "objective/non_score_reward": -2.006600856781006, + "objective/rlhf_reward": -6.470143883433893, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 9.361883163452148, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69921875, + "step": 1414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000566005706787 + }, + { + "episode": 22656, + "epoch": 0.1357443290074415, + "loss/policy_avg": 0.14402732253074646, + "lr": 9.095731083844581e-06, + "objective/entropy": -256.77880859375, + "objective/kl": 29.49087142944336, + "objective/non_score_reward": -1.474543571472168, + "objective/rlhf_reward": -4.3824026224934425, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 2.047593355178833, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 1415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0007848739624023 + }, + { + "episode": 22672, + "epoch": 0.13584019364657104, + "loss/policy_avg": -0.055459946393966675, + "lr": 9.095092024539878e-06, + "objective/entropy": -226.90335083007812, + "objective/kl": 35.10498809814453, + "objective/non_score_reward": -1.7552495002746582, + "objective/rlhf_reward": -5.570399980159149, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 30.274810791015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69921875, + "step": 1416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999528408050537 + }, + { + "episode": 22688, + "epoch": 0.1359360582857006, + "loss/policy_avg": 0.27508485317230225, + "lr": 9.094452965235175e-06, + "objective/entropy": -143.74124145507812, + "objective/kl": 37.461273193359375, + "objective/non_score_reward": -1.8730638027191162, + "objective/rlhf_reward": -5.092255330085754, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.542268753051758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.36328125, + "step": 1417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973454475402832 + }, + { + "episode": 22704, + "epoch": 0.13603192292483013, + "loss/policy_avg": 0.22670505940914154, + "lr": 9.093813905930472e-06, + "objective/entropy": -235.04212951660156, + "objective/kl": 27.81060791015625, + "objective/non_score_reward": -1.3905303478240967, + "objective/rlhf_reward": -4.046349847110447, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 18.150178909301758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.828125, + "step": 1418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9964942932128906 + }, + { + "episode": 22720, + "epoch": 0.13612778756395968, + "loss/policy_avg": -0.00467962771654129, + "lr": 9.093174846625767e-06, + "objective/entropy": -241.10305786132812, + "objective/kl": 36.07202911376953, + "objective/non_score_reward": -1.803601622581482, + "objective/rlhf_reward": -5.790574391086665, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 10.899436950683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.609375, + "step": 1419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9986159801483154 + }, + { + "episode": 22736, + "epoch": 0.13622365220308924, + "loss/policy_avg": 0.11034490168094635, + "lr": 9.092535787321064e-06, + "objective/entropy": -318.0650939941406, + "objective/kl": 31.329748153686523, + "objective/non_score_reward": -1.5664875507354736, + "objective/rlhf_reward": -4.5326165119806925, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.8243210315704346, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.66015625, + "step": 1420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999915599822998 + }, + { + "episode": 22752, + "epoch": 0.1363195168422188, + "loss/policy_avg": 0.04376043379306793, + "lr": 9.09189672801636e-06, + "objective/entropy": -257.55560302734375, + "objective/kl": 39.71128845214844, + "objective/non_score_reward": -1.9855643510818481, + "objective/rlhf_reward": -5.542257642745971, + "objective/scores": 0.6, + "policy/approxkl_avg": 33.98119354248047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.681640625, + "step": 1421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9983115196228027 + }, + { + "episode": 22768, + "epoch": 0.13641538148134832, + "loss/policy_avg": 1.4271972179412842, + "lr": 9.091257668711657e-06, + "objective/entropy": -200.30636596679688, + "objective/kl": 37.72491455078125, + "objective/non_score_reward": -1.8862457275390625, + "objective/rlhf_reward": -5.422276677862678, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 4.053761005401611, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6171875, + "step": 1422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0006790161132812 + }, + { + "episode": 22784, + "epoch": 0.13651124612047788, + "loss/policy_avg": 0.14689955115318298, + "lr": 9.090618609406954e-06, + "objective/entropy": -185.34646606445312, + "objective/kl": 29.554128646850586, + "objective/non_score_reward": -1.4777064323425293, + "objective/rlhf_reward": -3.7881193778672557, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.1496503353118896, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.537109375, + "step": 1423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997967004776001 + }, + { + "episode": 22800, + "epoch": 0.13660711075960744, + "loss/policy_avg": 1.6550846099853516, + "lr": 9.08997955010225e-06, + "objective/entropy": -237.4461669921875, + "objective/kl": 24.241506576538086, + "objective/non_score_reward": -1.2120752334594727, + "objective/rlhf_reward": -3.5227884388267228, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 2.2821950912475586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.623046875, + "step": 1424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0027098655700684 + }, + { + "episode": 22816, + "epoch": 0.136702975398737, + "loss/policy_avg": 0.04801030457019806, + "lr": 9.089340490797546e-06, + "objective/entropy": -192.99920654296875, + "objective/kl": 42.85979461669922, + "objective/non_score_reward": -2.1429896354675293, + "objective/rlhf_reward": -7.0910067586258645, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 7.908871650695801, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 1425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9976098537445068 + }, + { + "episode": 22832, + "epoch": 0.13679884003786652, + "loss/policy_avg": 0.773149847984314, + "lr": 9.088701431492843e-06, + "objective/entropy": -204.166015625, + "objective/kl": 33.09935760498047, + "objective/non_score_reward": -1.6549677848815918, + "objective/rlhf_reward": -4.219871020317077, + "objective/scores": 0.6, + "policy/approxkl_avg": 20.173633575439453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69921875, + "step": 1426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9965462684631348 + }, + { + "episode": 22848, + "epoch": 0.13689470467699608, + "loss/policy_avg": 0.1709783971309662, + "lr": 9.08806237218814e-06, + "objective/entropy": -294.40911865234375, + "objective/kl": 33.224021911621094, + "objective/non_score_reward": -1.6612012386322021, + "objective/rlhf_reward": -5.244805312156677, + "objective/scores": 0.35, + "policy/approxkl_avg": 8.916924476623535, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62109375, + "step": 1427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989229440689087 + }, + { + "episode": 22864, + "epoch": 0.13699056931612563, + "loss/policy_avg": -0.012170173227787018, + "lr": 9.087423312883437e-06, + "objective/entropy": -246.09201049804688, + "objective/kl": 36.022274017333984, + "objective/non_score_reward": -1.8011138439178467, + "objective/rlhf_reward": -5.862819781809478, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 1.0541167259216309, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.646484375, + "step": 1428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0022614002227783 + }, + { + "episode": 22880, + "epoch": 0.1370864339552552, + "loss/policy_avg": -0.04561644792556763, + "lr": 9.086784253578734e-06, + "objective/entropy": -243.6079559326172, + "objective/kl": 26.140531539916992, + "objective/non_score_reward": -1.3070266246795654, + "objective/rlhf_reward": -3.7123345372998084, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 66.2858657836914, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6484375, + "step": 1429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988305568695068 + }, + { + "episode": 22896, + "epoch": 0.13718229859438472, + "loss/policy_avg": -0.43349897861480713, + "lr": 9.086145194274029e-06, + "objective/entropy": -226.036376953125, + "objective/kl": 34.60681915283203, + "objective/non_score_reward": -1.730340838432312, + "objective/rlhf_reward": -5.440410855229258, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 9.1111478805542, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.591796875, + "step": 1430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0083601474761963 + }, + { + "episode": 22912, + "epoch": 0.13727816323351427, + "loss/policy_avg": -0.0003149360418319702, + "lr": 9.085506134969326e-06, + "objective/entropy": -238.45263671875, + "objective/kl": 36.35710144042969, + "objective/non_score_reward": -1.817854881286621, + "objective/rlhf_reward": -5.755647504123386, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 9.317008018493652, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.525390625, + "step": 1431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985694885253906 + }, + { + "episode": 22928, + "epoch": 0.13737402787264383, + "loss/policy_avg": 0.06102012097835541, + "lr": 9.084867075664623e-06, + "objective/entropy": -249.6441650390625, + "objective/kl": 30.982303619384766, + "objective/non_score_reward": -1.5491151809692383, + "objective/rlhf_reward": -6.1964609026908875, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.687659502029419, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.580078125, + "step": 1432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0023670196533203 + }, + { + "episode": 22944, + "epoch": 0.13746989251177338, + "loss/policy_avg": 0.4118673503398895, + "lr": 9.08422801635992e-06, + "objective/entropy": -208.8826446533203, + "objective/kl": 30.2443790435791, + "objective/non_score_reward": -1.512218952178955, + "objective/rlhf_reward": -3.926169695631538, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.5902602672576904, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.60546875, + "step": 1433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0101170539855957 + }, + { + "episode": 22960, + "epoch": 0.1375657571509029, + "loss/policy_avg": 0.48478835821151733, + "lr": 9.083588957055215e-06, + "objective/entropy": -193.95437622070312, + "objective/kl": 32.08013153076172, + "objective/non_score_reward": -1.6040066480636597, + "objective/rlhf_reward": -2.0160264730453488, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.34360408782959, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.658203125, + "step": 1434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999372959136963 + }, + { + "episode": 22976, + "epoch": 0.13766162179003247, + "loss/policy_avg": -0.11271242052316666, + "lr": 9.082949897750512e-06, + "objective/entropy": -230.10296630859375, + "objective/kl": 24.382526397705078, + "objective/non_score_reward": -1.2191263437271118, + "objective/rlhf_reward": -4.876505374908447, + "objective/scores": 0.0, + "policy/approxkl_avg": 127.57635498046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.701171875, + "step": 1435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.004199504852295 + }, + { + "episode": 22992, + "epoch": 0.13775748642916202, + "loss/policy_avg": 0.06364642083644867, + "lr": 9.082310838445809e-06, + "objective/entropy": -229.41696166992188, + "objective/kl": 41.24948501586914, + "objective/non_score_reward": -2.062474489212036, + "objective/rlhf_reward": -6.908262303381591, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 5.2524919509887695, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.732421875, + "step": 1436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999943733215332 + }, + { + "episode": 23008, + "epoch": 0.13785335106829158, + "loss/policy_avg": 0.48797979950904846, + "lr": 9.081671779141104e-06, + "objective/entropy": -276.9367370605469, + "objective/kl": 39.9646110534668, + "objective/non_score_reward": -1.9982305765151978, + "objective/rlhf_reward": -6.331062798917877, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 0.5203732252120972, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.705078125, + "step": 1437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0025830268859863 + }, + { + "episode": 23024, + "epoch": 0.1379492157074211, + "loss/policy_avg": -0.24298250675201416, + "lr": 9.0810327198364e-06, + "objective/entropy": -105.76115417480469, + "objective/kl": 33.14936447143555, + "objective/non_score_reward": -1.657468318939209, + "objective/rlhf_reward": -5.0257530546823315, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 8.427331924438477, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.57421875, + "step": 1438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.009620428085327 + }, + { + "episode": 23040, + "epoch": 0.13804508034655066, + "loss/policy_avg": 0.11671873927116394, + "lr": 9.080393660531698e-06, + "objective/entropy": -207.38742065429688, + "objective/kl": 25.86302375793457, + "objective/non_score_reward": -1.2931511402130127, + "objective/rlhf_reward": -3.7487728192406573, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 2.233325481414795, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 1439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003443717956543 + }, + { + "episode": 23056, + "epoch": 0.13814094498568022, + "loss/policy_avg": 0.32801347970962524, + "lr": 9.079754601226994e-06, + "objective/entropy": -256.9577941894531, + "objective/kl": 46.250244140625, + "objective/non_score_reward": -2.312512159347534, + "objective/rlhf_reward": -7.30263740845197, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 13.493009567260742, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69921875, + "step": 1440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997389316558838 + }, + { + "episode": 23072, + "epoch": 0.13823680962480978, + "loss/policy_avg": 0.11888322979211807, + "lr": 9.079115541922291e-06, + "objective/entropy": -188.8265380859375, + "objective/kl": 36.61035919189453, + "objective/non_score_reward": -1.8305180072784424, + "objective/rlhf_reward": -5.717951808039265, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 4.184604644775391, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5703125, + "step": 1441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999384880065918 + }, + { + "episode": 23088, + "epoch": 0.13833267426393933, + "loss/policy_avg": 1.4782209396362305, + "lr": 9.078476482617588e-06, + "objective/entropy": -250.94830322265625, + "objective/kl": 32.46335983276367, + "objective/non_score_reward": -1.6231679916381836, + "objective/rlhf_reward": -5.04207394561325, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 12.02247428894043, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71484375, + "step": 1442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9968842267990112 + }, + { + "episode": 23104, + "epoch": 0.13842853890306886, + "loss/policy_avg": 0.3007156252861023, + "lr": 9.077837423312883e-06, + "objective/entropy": -287.5181884765625, + "objective/kl": 36.19750213623047, + "objective/non_score_reward": -1.8098750114440918, + "objective/rlhf_reward": -2.839500284194946, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.40210723876953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.638671875, + "step": 1443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000827312469482 + }, + { + "episode": 23120, + "epoch": 0.13852440354219842, + "loss/policy_avg": 0.04818664491176605, + "lr": 9.07719836400818e-06, + "objective/entropy": -213.17276000976562, + "objective/kl": 29.73092269897461, + "objective/non_score_reward": -1.4865461587905884, + "objective/rlhf_reward": -4.284325008810149, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 7.239911079406738, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.703125, + "step": 1444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0012645721435547 + }, + { + "episode": 23136, + "epoch": 0.13862026818132797, + "loss/policy_avg": -0.07083216309547424, + "lr": 9.076559304703477e-06, + "objective/entropy": -245.51406860351562, + "objective/kl": 30.533039093017578, + "objective/non_score_reward": -1.5266518592834473, + "objective/rlhf_reward": -4.706607258319854, + "objective/scores": 0.35, + "policy/approxkl_avg": 18.655948638916016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.76953125, + "step": 1445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989795684814453 + }, + { + "episode": 23152, + "epoch": 0.13871613282045753, + "loss/policy_avg": -0.036476410925388336, + "lr": 9.075920245398774e-06, + "objective/entropy": -170.21502685546875, + "objective/kl": 32.540916442871094, + "objective/non_score_reward": -1.62704598903656, + "objective/rlhf_reward": -5.182671103507204, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 32.26573181152344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 1446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000288963317871 + }, + { + "episode": 23168, + "epoch": 0.13881199745958706, + "loss/policy_avg": -0.017804868519306183, + "lr": 9.075281186094071e-06, + "objective/entropy": -137.2032470703125, + "objective/kl": 43.56850814819336, + "objective/non_score_reward": -2.1784255504608154, + "objective/rlhf_reward": -7.263103942485198, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 6.8924241065979, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.701171875, + "step": 1447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0064001083374023 + }, + { + "episode": 23184, + "epoch": 0.1389078620987166, + "loss/policy_avg": -0.14877469837665558, + "lr": 9.074642126789366e-06, + "objective/entropy": -225.8348388671875, + "objective/kl": 37.99517059326172, + "objective/non_score_reward": -1.8997586965560913, + "objective/rlhf_reward": -5.476328434721504, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 12.023405075073242, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.52734375, + "step": 1448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999727725982666 + }, + { + "episode": 23200, + "epoch": 0.13900372673784617, + "loss/policy_avg": 0.5529218316078186, + "lr": 9.074003067484663e-06, + "objective/entropy": -227.67355346679688, + "objective/kl": 37.310272216796875, + "objective/non_score_reward": -1.865513801574707, + "objective/rlhf_reward": -7.462055325508118, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.481158256530762, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 1449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991602897644043 + }, + { + "episode": 23216, + "epoch": 0.13909959137697572, + "loss/policy_avg": 0.12435504049062729, + "lr": 9.07336400817996e-06, + "objective/entropy": -278.33416748046875, + "objective/kl": 31.698009490966797, + "objective/non_score_reward": -1.5849003791809082, + "objective/rlhf_reward": -4.216895284430061, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.932787895202637, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.560546875, + "step": 1450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9972891807556152 + }, + { + "episode": 23232, + "epoch": 0.13919545601610525, + "loss/policy_avg": 0.281582236289978, + "lr": 9.072724948875257e-06, + "objective/entropy": -217.79067993164062, + "objective/kl": 24.10039520263672, + "objective/non_score_reward": -1.2050197124481201, + "objective/rlhf_reward": -3.2159590459504894, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 10.788034439086914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 1451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001490116119385 + }, + { + "episode": 23248, + "epoch": 0.1392913206552348, + "loss/policy_avg": 0.29366064071655273, + "lr": 9.072085889570554e-06, + "objective/entropy": -186.27256774902344, + "objective/kl": 31.6138858795166, + "objective/non_score_reward": -1.580694317817688, + "objective/rlhf_reward": -4.981141617804198, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 34.03179931640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6328125, + "step": 1452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998284101486206 + }, + { + "episode": 23264, + "epoch": 0.13938718529436436, + "loss/policy_avg": -0.07383158057928085, + "lr": 9.07144683026585e-06, + "objective/entropy": -212.43728637695312, + "objective/kl": 32.51192092895508, + "objective/non_score_reward": -1.625596046447754, + "objective/rlhf_reward": -4.84052479785739, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 12.179756164550781, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.56640625, + "step": 1453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0035171508789062 + }, + { + "episode": 23280, + "epoch": 0.13948304993349392, + "loss/policy_avg": 0.3167204260826111, + "lr": 9.070807770961146e-06, + "objective/entropy": -238.9555206298828, + "objective/kl": 32.64189529418945, + "objective/non_score_reward": -1.6320947408676147, + "objective/rlhf_reward": -4.128379082679748, + "objective/scores": 0.6, + "policy/approxkl_avg": 23.131351470947266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 1454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998004674911499 + }, + { + "episode": 23296, + "epoch": 0.13957891457262345, + "loss/policy_avg": 0.23313897848129272, + "lr": 9.070168711656443e-06, + "objective/entropy": -61.311492919921875, + "objective/kl": 33.116355895996094, + "objective/non_score_reward": -1.6558178663253784, + "objective/rlhf_reward": -5.297758612662477, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 35.88987350463867, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.880859375, + "step": 1455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9964382648468018 + }, + { + "episode": 23312, + "epoch": 0.139674779211753, + "loss/policy_avg": 0.24700571596622467, + "lr": 9.069529652351738e-06, + "objective/entropy": -255.76536560058594, + "objective/kl": 32.391082763671875, + "objective/non_score_reward": -1.6195542812347412, + "objective/rlhf_reward": -5.136581590681701, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 18.76715087890625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.658203125, + "step": 1456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971431493759155 + }, + { + "episode": 23328, + "epoch": 0.13977064385088256, + "loss/policy_avg": 0.15339264273643494, + "lr": 9.068890593047035e-06, + "objective/entropy": -262.24359130859375, + "objective/kl": 44.425445556640625, + "objective/non_score_reward": -2.2212722301483154, + "objective/rlhf_reward": -7.328829615321711, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 10.498491287231445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578125, + "step": 1457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99570894241333 + }, + { + "episode": 23344, + "epoch": 0.13986650849001211, + "loss/policy_avg": 0.2555537521839142, + "lr": 9.068251533742332e-06, + "objective/entropy": -233.40225219726562, + "objective/kl": 32.24791717529297, + "objective/non_score_reward": -1.6123958826065063, + "objective/rlhf_reward": -4.998985271067962, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 26.18222999572754, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.591796875, + "step": 1458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9968748092651367 + }, + { + "episode": 23360, + "epoch": 0.13996237312914164, + "loss/policy_avg": -0.060967281460762024, + "lr": 9.067612474437628e-06, + "objective/entropy": -264.13665771484375, + "objective/kl": 40.685516357421875, + "objective/non_score_reward": -2.034276008605957, + "objective/rlhf_reward": -6.014397802130256, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 3.826961040496826, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7109375, + "step": 1459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0120983123779297 + }, + { + "episode": 23376, + "epoch": 0.1400582377682712, + "loss/policy_avg": 0.20050451159477234, + "lr": 9.066973415132925e-06, + "objective/entropy": -100.12745666503906, + "objective/kl": 37.77911376953125, + "objective/non_score_reward": -1.88895583152771, + "objective/rlhf_reward": -6.177221098033291, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 11.450329780578613, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.84375, + "step": 1460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0007660388946533 + }, + { + "episode": 23392, + "epoch": 0.14015410240740075, + "loss/policy_avg": 0.27806586027145386, + "lr": 9.06633435582822e-06, + "objective/entropy": -227.0691375732422, + "objective/kl": 31.303085327148438, + "objective/non_score_reward": -1.5651543140411377, + "objective/rlhf_reward": -4.935104284316225, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 1.6722452640533447, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.544921875, + "step": 1461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002023696899414 + }, + { + "episode": 23408, + "epoch": 0.1402499670465303, + "loss/policy_avg": 0.4921458065509796, + "lr": 9.065695296523517e-06, + "objective/entropy": -282.57427978515625, + "objective/kl": 26.013471603393555, + "objective/non_score_reward": -1.3006736040115356, + "objective/rlhf_reward": -2.2789752229463787, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 4.31304931640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73046875, + "step": 1462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9963445663452148 + }, + { + "episode": 23424, + "epoch": 0.14034583168565984, + "loss/policy_avg": 0.3357080817222595, + "lr": 9.065056237218814e-06, + "objective/entropy": -292.97235107421875, + "objective/kl": 25.093395233154297, + "objective/non_score_reward": -1.2546697854995728, + "objective/rlhf_reward": -5.018679141998291, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.99601936340332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73046875, + "step": 1463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978545904159546 + }, + { + "episode": 23440, + "epoch": 0.1404416963247894, + "loss/policy_avg": 0.2813834846019745, + "lr": 9.064417177914111e-06, + "objective/entropy": -208.256103515625, + "objective/kl": 36.03406524658203, + "objective/non_score_reward": -1.8017032146453857, + "objective/rlhf_reward": -5.47347940603892, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 30.506061553955078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73828125, + "step": 1464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998169183731079 + }, + { + "episode": 23456, + "epoch": 0.14053756096391895, + "loss/policy_avg": 0.2761915922164917, + "lr": 9.063778118609408e-06, + "objective/entropy": -172.63931274414062, + "objective/kl": 33.94431686401367, + "objective/non_score_reward": -1.6972159147262573, + "objective/rlhf_reward": -5.365031678875057, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 4.0666117668151855, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.662109375, + "step": 1465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994304180145264 + }, + { + "episode": 23472, + "epoch": 0.1406334256030485, + "loss/policy_avg": 0.667277455329895, + "lr": 9.063139059304705e-06, + "objective/entropy": -179.61021423339844, + "objective/kl": 30.94757843017578, + "objective/non_score_reward": -1.5473790168762207, + "objective/rlhf_reward": -4.8109142566598475, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.069715976715088, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.662109375, + "step": 1466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0003573894500732 + }, + { + "episode": 23488, + "epoch": 0.14072929024217803, + "loss/policy_avg": 0.03628428280353546, + "lr": 9.0625e-06, + "objective/entropy": -235.0047607421875, + "objective/kl": 34.63279342651367, + "objective/non_score_reward": -1.7316396236419678, + "objective/rlhf_reward": -4.52655873298645, + "objective/scores": 0.6, + "policy/approxkl_avg": 58.85393524169922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.734375, + "step": 1467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000277280807495 + }, + { + "episode": 23504, + "epoch": 0.1408251548813076, + "loss/policy_avg": 0.2930186688899994, + "lr": 9.061860940695297e-06, + "objective/entropy": -311.19976806640625, + "objective/kl": 39.07278060913086, + "objective/non_score_reward": -1.953639030456543, + "objective/rlhf_reward": -6.258296697345331, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 5.577837944030762, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.654296875, + "step": 1468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985158443450928 + }, + { + "episode": 23520, + "epoch": 0.14092101952043715, + "loss/policy_avg": -0.16989761590957642, + "lr": 9.061221881390594e-06, + "objective/entropy": -221.3096923828125, + "objective/kl": 34.3406982421875, + "objective/non_score_reward": -1.7170348167419434, + "objective/rlhf_reward": -4.468139624595642, + "objective/scores": 0.6, + "policy/approxkl_avg": 24.528539657592773, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.650390625, + "step": 1469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.003714084625244 + }, + { + "episode": 23536, + "epoch": 0.1410168841595667, + "loss/policy_avg": 0.5539761781692505, + "lr": 9.06058282208589e-06, + "objective/entropy": -208.52297973632812, + "objective/kl": 31.98680877685547, + "objective/non_score_reward": -1.5993404388427734, + "objective/rlhf_reward": -5.038111769889278, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.0958806276321411, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.583984375, + "step": 1470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0022308826446533 + }, + { + "episode": 23552, + "epoch": 0.14111274879869623, + "loss/policy_avg": 0.18784965574741364, + "lr": 9.059943762781188e-06, + "objective/entropy": -209.775146484375, + "objective/kl": 32.73426818847656, + "objective/non_score_reward": -1.6367132663726807, + "objective/rlhf_reward": -5.205217769652037, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 14.739479064941406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.830078125, + "step": 1471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9984816312789917 + }, + { + "episode": 23568, + "epoch": 0.1412086134378258, + "loss/policy_avg": -0.015954041853547096, + "lr": 9.059304703476484e-06, + "objective/entropy": -266.16412353515625, + "objective/kl": 41.6230354309082, + "objective/non_score_reward": -2.0811514854431152, + "objective/rlhf_reward": -6.965356790755672, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 2.9907431602478027, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.751953125, + "step": 1472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9974491596221924 + }, + { + "episode": 23584, + "epoch": 0.14130447807695534, + "loss/policy_avg": 0.38890203833580017, + "lr": 9.05866564417178e-06, + "objective/entropy": -276.6087951660156, + "objective/kl": 31.15995216369629, + "objective/non_score_reward": -1.5579975843429565, + "objective/rlhf_reward": -4.407161588939737, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 27.55563735961914, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.587890625, + "step": 1473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981493949890137 + }, + { + "episode": 23600, + "epoch": 0.1414003427160849, + "loss/policy_avg": 0.47803691029548645, + "lr": 9.058026584867077e-06, + "objective/entropy": -212.98114013671875, + "objective/kl": 45.53947067260742, + "objective/non_score_reward": -2.2769737243652344, + "objective/rlhf_reward": -7.782381329566164, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 18.62641716003418, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689453125, + "step": 1474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9991447925567627 + }, + { + "episode": 23616, + "epoch": 0.14149620735521443, + "loss/policy_avg": -0.034300077706575394, + "lr": 9.057387525562373e-06, + "objective/entropy": -228.06358337402344, + "objective/kl": 31.04609489440918, + "objective/non_score_reward": -1.5523046255111694, + "objective/rlhf_reward": -4.883705649405641, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 2.5281810760498047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 1475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0011422634124756 + }, + { + "episode": 23632, + "epoch": 0.14159207199434398, + "loss/policy_avg": 0.021888693794608116, + "lr": 9.05674846625767e-06, + "objective/entropy": -190.63259887695312, + "objective/kl": 25.703638076782227, + "objective/non_score_reward": -1.2851818799972534, + "objective/rlhf_reward": -3.4788681320553883, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.3920494318008423, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.521484375, + "step": 1476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001204013824463 + }, + { + "episode": 23648, + "epoch": 0.14168793663347354, + "loss/policy_avg": 0.5509345531463623, + "lr": 9.056109406952967e-06, + "objective/entropy": -259.56756591796875, + "objective/kl": 34.98920440673828, + "objective/non_score_reward": -1.749460220336914, + "objective/rlhf_reward": -5.335981493414031, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 10.9700927734375, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.57421875, + "step": 1477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0011067390441895 + }, + { + "episode": 23664, + "epoch": 0.1417838012726031, + "loss/policy_avg": -0.3546954095363617, + "lr": 9.055470347648262e-06, + "objective/entropy": -182.73776245117188, + "objective/kl": 40.87359619140625, + "objective/non_score_reward": -2.043679714202881, + "objective/rlhf_reward": -5.251000319362852, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 18.994150161743164, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.83203125, + "step": 1478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9988384246826172 + }, + { + "episode": 23680, + "epoch": 0.14187966591173262, + "loss/policy_avg": 0.18590596318244934, + "lr": 9.05483128834356e-06, + "objective/entropy": -204.18365478515625, + "objective/kl": 27.81656837463379, + "objective/non_score_reward": -1.3908284902572632, + "objective/rlhf_reward": -4.204064035151882, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.7427480220794678, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.595703125, + "step": 1479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998271465301514 + }, + { + "episode": 23696, + "epoch": 0.14197553055086218, + "loss/policy_avg": 2.363548755645752, + "lr": 9.054192229038854e-06, + "objective/entropy": -259.5701599121094, + "objective/kl": 34.00105285644531, + "objective/non_score_reward": -1.7000526189804077, + "objective/rlhf_reward": -4.8527993661927535, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 24.17676544189453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.705078125, + "step": 1480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9970941543579102 + }, + { + "episode": 23712, + "epoch": 0.14207139518999173, + "loss/policy_avg": 0.10088340193033218, + "lr": 9.053553169734151e-06, + "objective/entropy": -170.8048095703125, + "objective/kl": 33.465816497802734, + "objective/non_score_reward": -1.6732908487319946, + "objective/rlhf_reward": -5.367650661498232, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 3.380504608154297, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.56640625, + "step": 1481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998365640640259 + }, + { + "episode": 23728, + "epoch": 0.1421672598291213, + "loss/policy_avg": 0.303548663854599, + "lr": 9.052914110429448e-06, + "objective/entropy": -291.6600341796875, + "objective/kl": 36.511512756347656, + "objective/non_score_reward": -1.8255757093429565, + "objective/rlhf_reward": -5.878470738132563, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 22.745920181274414, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5703125, + "step": 1482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9991157054901123 + }, + { + "episode": 23744, + "epoch": 0.14226312446825082, + "loss/policy_avg": 0.6809051632881165, + "lr": 9.052275051124745e-06, + "objective/entropy": -224.0225830078125, + "objective/kl": 27.76144790649414, + "objective/non_score_reward": -1.3880724906921387, + "objective/rlhf_reward": -4.036518180164036, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 1.4273369312286377, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 1483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000868082046509 + }, + { + "episode": 23760, + "epoch": 0.14235898910738037, + "loss/policy_avg": 0.5665885210037231, + "lr": 9.051635991820042e-06, + "objective/entropy": -210.581298828125, + "objective/kl": 42.927467346191406, + "objective/non_score_reward": -2.1463735103607178, + "objective/rlhf_reward": -7.134895424456939, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 106.22406005859375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75, + "step": 1484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9954371452331543 + }, + { + "episode": 23776, + "epoch": 0.14245485374650993, + "loss/policy_avg": 0.09896639734506607, + "lr": 9.050996932515339e-06, + "objective/entropy": -248.680419921875, + "objective/kl": 31.273435592651367, + "objective/non_score_reward": -1.5636719465255737, + "objective/rlhf_reward": -4.830855686863033, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 124.75199890136719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73046875, + "step": 1485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998034954071045 + }, + { + "episode": 23792, + "epoch": 0.14255071838563949, + "loss/policy_avg": 0.06345228850841522, + "lr": 9.050357873210634e-06, + "objective/entropy": -207.33094787597656, + "objective/kl": 41.283634185791016, + "objective/non_score_reward": -2.0641818046569824, + "objective/rlhf_reward": -6.775774124081492, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 1.8670159578323364, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 1486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999525547027588 + }, + { + "episode": 23808, + "epoch": 0.142646583024769, + "loss/policy_avg": -0.08927027136087418, + "lr": 9.049718813905931e-06, + "objective/entropy": -192.79010009765625, + "objective/kl": 38.38804626464844, + "objective/non_score_reward": -1.9194023609161377, + "objective/rlhf_reward": -6.073489222590046, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.219352960586548, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.607421875, + "step": 1487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.005600929260254 + }, + { + "episode": 23824, + "epoch": 0.14274244766389857, + "loss/policy_avg": 0.17106056213378906, + "lr": 9.049079754601228e-06, + "objective/entropy": -174.7689208984375, + "objective/kl": 33.021854400634766, + "objective/non_score_reward": -1.6510926485061646, + "objective/rlhf_reward": -4.871037379900614, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.4489188194274902, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6171875, + "step": 1488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988312721252441 + }, + { + "episode": 23840, + "epoch": 0.14283831230302813, + "loss/policy_avg": -0.03305444121360779, + "lr": 9.048440695296525e-06, + "objective/entropy": -201.0426788330078, + "objective/kl": 38.58399200439453, + "objective/non_score_reward": -1.9291996955871582, + "objective/rlhf_reward": -5.316798543930053, + "objective/scores": 0.6, + "policy/approxkl_avg": 23.38889503479004, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 1489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998862385749817 + }, + { + "episode": 23856, + "epoch": 0.14293417694215768, + "loss/policy_avg": -0.13663126528263092, + "lr": 9.047801635991821e-06, + "objective/entropy": -128.63768005371094, + "objective/kl": 31.9277400970459, + "objective/non_score_reward": -1.5963871479034424, + "objective/rlhf_reward": -4.869776809009251, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 67.26014709472656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 1490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001464366912842 + }, + { + "episode": 23872, + "epoch": 0.1430300415812872, + "loss/policy_avg": -0.10490886867046356, + "lr": 9.047162576687117e-06, + "objective/entropy": -172.6685791015625, + "objective/kl": 40.15425491333008, + "objective/non_score_reward": -2.0077128410339355, + "objective/rlhf_reward": -6.206022734912942, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 1.7667760848999023, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.630859375, + "step": 1491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000633716583252 + }, + { + "episode": 23888, + "epoch": 0.14312590622041677, + "loss/policy_avg": -0.010392919182777405, + "lr": 9.046523517382414e-06, + "objective/entropy": -114.63517761230469, + "objective/kl": 42.928489685058594, + "objective/non_score_reward": -2.1464245319366455, + "objective/rlhf_reward": -5.6619793518793315, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 23.010705947875977, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.806640625, + "step": 1492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000854015350342 + }, + { + "episode": 23904, + "epoch": 0.14322177085954632, + "loss/policy_avg": 0.33309632539749146, + "lr": 9.04588445807771e-06, + "objective/entropy": -254.52764892578125, + "objective/kl": 36.419219970703125, + "objective/non_score_reward": -1.8209609985351562, + "objective/rlhf_reward": -7.2838438749313354, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.836235046386719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7734375, + "step": 1493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 2.0010738372802734 + }, + { + "episode": 23920, + "epoch": 0.14331763549867588, + "loss/policy_avg": 0.7723073959350586, + "lr": 9.045245398773007e-06, + "objective/entropy": -241.48744201660156, + "objective/kl": 45.3399658203125, + "objective/non_score_reward": -2.266998291015625, + "objective/rlhf_reward": -9.06799328327179, + "objective/scores": 0.0, + "policy/approxkl_avg": 72.61679077148438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62109375, + "step": 1494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9994328022003174 + }, + { + "episode": 23936, + "epoch": 0.1434135001378054, + "loss/policy_avg": 0.12270835041999817, + "lr": 9.044606339468304e-06, + "objective/entropy": -233.93212890625, + "objective/kl": 36.94314956665039, + "objective/non_score_reward": -1.8471574783325195, + "objective/rlhf_reward": -5.726770406187163, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 18.066661834716797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 1495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.997254729270935 + }, + { + "episode": 23952, + "epoch": 0.14350936477693496, + "loss/policy_avg": 0.17836476862430573, + "lr": 9.043967280163601e-06, + "objective/entropy": -189.3540496826172, + "objective/kl": 33.162147521972656, + "objective/non_score_reward": -1.6581075191497803, + "objective/rlhf_reward": -2.232429778575897, + "objective/scores": 1.1, + "policy/approxkl_avg": 8.499994277954102, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.693359375, + "step": 1496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998523473739624 + }, + { + "episode": 23968, + "epoch": 0.14360522941606452, + "loss/policy_avg": 0.02606182172894478, + "lr": 9.043328220858896e-06, + "objective/entropy": -187.52023315429688, + "objective/kl": 27.699565887451172, + "objective/non_score_reward": -1.3849782943725586, + "objective/rlhf_reward": -3.983654229846552, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 35.55757522583008, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.681640625, + "step": 1497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000664234161377 + }, + { + "episode": 23984, + "epoch": 0.14370109405519407, + "loss/policy_avg": 0.2732602655887604, + "lr": 9.042689161554193e-06, + "objective/entropy": -221.50460815429688, + "objective/kl": 31.577119827270508, + "objective/non_score_reward": -1.5788559913635254, + "objective/rlhf_reward": -1.9154240846633908, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.376906394958496, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.72265625, + "step": 1498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000617027282715 + }, + { + "episode": 24000, + "epoch": 0.14379695869432363, + "loss/policy_avg": 0.20893090963363647, + "lr": 9.04205010224949e-06, + "objective/entropy": -245.84298706054688, + "objective/kl": 30.781171798706055, + "objective/non_score_reward": -1.5390586853027344, + "objective/rlhf_reward": -4.7776325727380335, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 19.28199577331543, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 1499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9972145557403564 + } + ], + "logging_steps": 500, + "max_steps": 7824, + "num_input_tokens_seen": 0, + "num_train_epochs": 3.0, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": true, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0, + "train_batch_size": null, + "trial_name": null, + "trial_params": null +}