{
  "best_metric": null,
  "best_model_checkpoint": null,
  "episode": 16000,
  "epoch": 0.09586463912954908,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "episode": 16,
      "epoch": 9.586463912954908e-05,
      "loss/policy_avg": 0.015691569074988365,
      "lr": 1e-05,
      "objective/entropy": 136.889404296875,
      "objective/kl": 13.172518730163574,
      "objective/non_score_reward": -0.6586259603500366,
      "objective/rlhf_reward": -1.2559016580260813,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 330.0568542480469,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.75,
      "step": 0,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.999426007270813
    },
    {
      "episode": 32,
      "epoch": 0.00019172927825909816,
      "loss/policy_avg": 0.021727558225393295,
      "lr": 9.999360940695298e-06,
      "objective/entropy": -4.705432891845703,
      "objective/kl": 4.4086012840271,
      "objective/non_score_reward": -0.22043009102344513,
      "objective/rlhf_reward": 0.49688179692854306,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 25.247615814208984,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4375,
      "step": 1,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0005669593811035
    },
    {
      "episode": 48,
      "epoch": 0.00028759391738864725,
      "loss/policy_avg": 0.05422616004943848,
      "lr": 9.998721881390595e-06,
      "objective/entropy": 26.511795043945312,
      "objective/kl": 10.364278793334961,
      "objective/non_score_reward": -0.5182140469551086,
      "objective/rlhf_reward": -0.6222579917923059,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 174.7788543701172,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.6328125,
      "step": 2,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.001580238342285
    },
    {
      "episode": 64,
      "epoch": 0.0003834585565181963,
      "loss/policy_avg": 0.1031150370836258,
      "lr": 9.99808282208589e-06,
      "objective/entropy": -6.2874298095703125,
      "objective/kl": 7.10389518737793,
      "objective/non_score_reward": -0.35519474744796753,
      "objective/rlhf_reward": 0.24108044284523888,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 107.51742553710938,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.806640625,
      "step": 3,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9999995231628418
    },
    {
      "episode": 80,
      "epoch": 0.0004793231956477454,
      "loss/policy_avg": 0.020609447732567787,
      "lr": 9.997443762781187e-06,
      "objective/entropy": 63.54547882080078,
      "objective/kl": 1.458254337310791,
      "objective/non_score_reward": -0.07291271537542343,
      "objective/rlhf_reward": 1.224120924828116,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 14.240117073059082,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4150390625,
      "step": 4,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.000563383102417
    },
    {
      "episode": 96,
      "epoch": 0.0005751878347772945,
      "loss/policy_avg": 0.1277482807636261,
      "lr": 9.996804703476484e-06,
      "objective/entropy": 55.068546295166016,
      "objective/kl": 8.753851890563965,
      "objective/non_score_reward": -0.43769264221191406,
      "objective/rlhf_reward": -0.37216834077010735,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 100.08578491210938,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.447265625,
      "step": 5,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999474048614502
    },
    {
      "episode": 112,
      "epoch": 0.0006710524739068436,
      "loss/policy_avg": 0.3148539662361145,
      "lr": 9.99616564417178e-06,
      "objective/entropy": 21.463600158691406,
      "objective/kl": 9.847577095031738,
      "objective/non_score_reward": -0.4923788607120514,
      "objective/rlhf_reward": -0.02210425861352272,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 82.89840698242188,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.619140625,
      "step": 6,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.998772382736206
    },
    {
      "episode": 128,
      "epoch": 0.0007669171130363926,
      "loss/policy_avg": -9.760260581970215e-06,
      "lr": 9.995526584867077e-06,
      "objective/entropy": 43.514984130859375,
      "objective/kl": 6.468422889709473,
      "objective/non_score_reward": -0.3234211802482605,
      "objective/rlhf_reward": 0.18726797867262368,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 53.660911560058594,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.595703125,
      "step": 7,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0024185180664062
    },
    {
      "episode": 144,
      "epoch": 0.0008627817521659417,
      "loss/policy_avg": 0.07420124113559723,
      "lr": 9.994887525562374e-06,
      "objective/entropy": 111.558837890625,
      "objective/kl": 5.765064716339111,
      "objective/non_score_reward": -0.2882532477378845,
      "objective/rlhf_reward": 0.7943982454372089,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 38.34186935424805,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4462890625,
      "step": 8,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9975563287734985
    },
    {
      "episode": 160,
      "epoch": 0.0009586463912954908,
      "loss/policy_avg": 0.22252294421195984,
      "lr": 9.99424846625767e-06,
      "objective/entropy": 99.2086181640625,
      "objective/kl": 8.770297050476074,
      "objective/non_score_reward": -0.4385148584842682,
      "objective/rlhf_reward": -0.35405938923358926,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 98.07421112060547,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.75,
      "step": 9,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9961905479431152
    },
    {
      "episode": 176,
      "epoch": 0.0010545110304250398,
      "loss/policy_avg": 0.05278925597667694,
      "lr": 9.993609406952966e-06,
      "objective/entropy": 192.25936889648438,
      "objective/kl": 5.483057975769043,
      "objective/non_score_reward": -0.27415287494659424,
      "objective/rlhf_reward": 1.3033885151147842,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 54.852699279785156,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.73046875,
      "step": 10,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0001378059387207
    },
    {
      "episode": 192,
      "epoch": 0.001150375669554589,
      "loss/policy_avg": 0.01604432426393032,
      "lr": 9.992970347648263e-06,
      "objective/entropy": 91.4354476928711,
      "objective/kl": 1.6482281684875488,
      "objective/non_score_reward": -0.08241140842437744,
      "objective/rlhf_reward": 1.1513069728358984,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 12.662862777709961,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.5390625,
      "step": 11,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9994404315948486
    },
    {
      "episode": 208,
      "epoch": 0.001246240308684138,
      "loss/policy_avg": 0.17367278039455414,
      "lr": 9.992331288343558e-06,
      "objective/entropy": 148.37680053710938,
      "objective/kl": 9.977045059204102,
      "objective/non_score_reward": -0.4988522529602051,
      "objective/rlhf_reward": -0.4796372515880427,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 132.6361083984375,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4619140625,
      "step": 12,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9963808059692383
    },
    {
      "episode": 224,
      "epoch": 0.0013421049478136871,
      "loss/policy_avg": -0.12138635665178299,
      "lr": 9.991692229038855e-06,
      "objective/entropy": -70.20156860351562,
      "objective/kl": 3.8376624584198,
      "objective/non_score_reward": -0.1918831169605255,
      "objective/rlhf_reward": 0.6324675619602202,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 15.127391815185547,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.505859375,
      "step": 13,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.017153739929199
    },
    {
      "episode": 240,
      "epoch": 0.001437969586943236,
      "loss/policy_avg": 0.1106414794921875,
      "lr": 9.991053169734152e-06,
      "objective/entropy": 129.54013061523438,
      "objective/kl": 12.085613250732422,
      "objective/non_score_reward": -0.6042807102203369,
      "objective/rlhf_reward": -0.6837895224491755,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 178.22561645507812,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.5390625,
      "step": 14,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.999481201171875
    },
    {
      "episode": 256,
      "epoch": 0.0015338342260727853,
      "loss/policy_avg": 0.01672934927046299,
      "lr": 9.990414110429449e-06,
      "objective/entropy": 177.98126220703125,
      "objective/kl": 7.125063896179199,
      "objective/non_score_reward": -0.3562532067298889,
      "objective/rlhf_reward": -0.025012841820716947,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 91.47238159179688,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.716796875,
      "step": 15,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.000582218170166
    },
    {
      "episode": 272,
      "epoch": 0.0016296988652023342,
      "loss/policy_avg": 0.14258402585983276,
      "lr": 9.989775051124744e-06,
      "objective/entropy": 197.2217559814453,
      "objective/kl": 12.70147705078125,
      "objective/non_score_reward": -0.6350738406181335,
      "objective/rlhf_reward": -1.1616931343949852,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 84.26277160644531,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.630859375,
      "step": 16,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9964921474456787
    },
    {
      "episode": 288,
      "epoch": 0.0017255635043318834,
      "loss/policy_avg": -0.0007228106260299683,
      "lr": 9.989135991820041e-06,
      "objective/entropy": -9.756143569946289,
      "objective/kl": 7.940765380859375,
      "objective/non_score_reward": -0.3970382809638977,
      "objective/rlhf_reward": -0.07238138595455501,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 42.61369323730469,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.64453125,
      "step": 17,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0011234283447266
    },
    {
      "episode": 304,
      "epoch": 0.0018214281434614326,
      "loss/policy_avg": 0.13892704248428345,
      "lr": 9.988496932515338e-06,
      "objective/entropy": 14.549068450927734,
      "objective/kl": 9.783748626708984,
      "objective/non_score_reward": -0.48918741941452026,
      "objective/rlhf_reward": -0.5781475538886606,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 73.81009674072266,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.607421875,
      "step": 18,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.998319149017334
    },
    {
      "episode": 320,
      "epoch": 0.0019172927825909815,
      "loss/policy_avg": 0.12347989529371262,
      "lr": 9.987857873210635e-06,
      "objective/entropy": 197.0328369140625,
      "objective/kl": 9.07555103302002,
      "objective/non_score_reward": -0.453777551651001,
      "objective/rlhf_reward": -0.15325071436225013,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 74.28388214111328,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5625,
      "step": 19,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.001952648162842
    },
    {
      "episode": 336,
      "epoch": 0.0020131574217205307,
      "loss/policy_avg": 0.06666804850101471,
      "lr": 9.987218813905932e-06,
      "objective/entropy": 180.56707763671875,
      "objective/kl": 10.346174240112305,
      "objective/non_score_reward": -0.5173087120056152,
      "objective/rlhf_reward": -0.6454025848704257,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 88.01742553710938,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.595703125,
      "step": 20,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9958171844482422
    },
    {
      "episode": 352,
      "epoch": 0.0021090220608500796,
      "loss/policy_avg": 0.12632718682289124,
      "lr": 9.986579754601228e-06,
      "objective/entropy": 165.49900817871094,
      "objective/kl": 10.707776069641113,
      "objective/non_score_reward": -0.5353888273239136,
      "objective/rlhf_reward": -0.7629530663169442,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 118.42108917236328,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.78125,
      "step": 21,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9964122772216797
    },
    {
      "episode": 368,
      "epoch": 0.0022048866999796286,
      "loss/policy_avg": 0.012576747685670853,
      "lr": 9.985940695296524e-06,
      "objective/entropy": -133.83059692382812,
      "objective/kl": 6.06254768371582,
      "objective/non_score_reward": -0.3031274080276489,
      "objective/rlhf_reward": 0.21132251183215,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 4.497255325317383,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.552734375,
      "step": 22,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0017709732055664
    },
    {
      "episode": 384,
      "epoch": 0.002300751339109178,
      "loss/policy_avg": 0.21566970646381378,
      "lr": 9.98530163599182e-06,
      "objective/entropy": 80.05180358886719,
      "objective/kl": 18.019107818603516,
      "objective/non_score_reward": -0.9009554386138916,
      "objective/rlhf_reward": -2.1799896850186267,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 244.3957061767578,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.72265625,
      "step": 23,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9975435733795166
    },
    {
      "episode": 400,
      "epoch": 0.002396615978238727,
      "loss/policy_avg": 0.21825431287288666,
      "lr": 9.984662576687117e-06,
      "objective/entropy": 22.858154296875,
      "objective/kl": 7.889187812805176,
      "objective/non_score_reward": -0.39445942640304565,
      "objective/rlhf_reward": 0.5448686011871957,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 45.33286666870117,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.54296875,
      "step": 24,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9998900890350342
    },
    {
      "episode": 416,
      "epoch": 0.002492480617368276,
      "loss/policy_avg": 0.2645857036113739,
      "lr": 9.984023517382414e-06,
      "objective/entropy": 37.619895935058594,
      "objective/kl": 11.23090934753418,
      "objective/non_score_reward": -0.5615454316139221,
      "objective/rlhf_reward": 0.15381827354431143,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 88.95787811279297,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.552734375,
      "step": 25,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.996498703956604
    },
    {
      "episode": 432,
      "epoch": 0.002588345256497825,
      "loss/policy_avg": 0.04753335565328598,
      "lr": 9.983384458077711e-06,
      "objective/entropy": 156.34921264648438,
      "objective/kl": 7.371222496032715,
      "objective/non_score_reward": -0.36856111884117126,
      "objective/rlhf_reward": -0.14873159292332616,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 35.437461853027344,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6015625,
      "step": 26,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9979305267333984
    },
    {
      "episode": 448,
      "epoch": 0.0026842098956273742,
      "loss/policy_avg": -0.010932949371635914,
      "lr": 9.982745398773006e-06,
      "objective/entropy": 16.393407821655273,
      "objective/kl": 16.967132568359375,
      "objective/non_score_reward": -0.8483567237854004,
      "objective/rlhf_reward": -2.051791122465759,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 207.71142578125,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.564453125,
      "step": 27,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9935011863708496
    },
    {
      "episode": 464,
      "epoch": 0.002780074534756923,
      "loss/policy_avg": 0.23893436789512634,
      "lr": 9.982106339468303e-06,
      "objective/entropy": 170.59136962890625,
      "objective/kl": 15.129783630371094,
      "objective/non_score_reward": -0.7564891576766968,
      "objective/rlhf_reward": -1.469697265830591,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 135.97763061523438,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.72265625,
      "step": 28,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9975056648254395
    },
    {
      "episode": 480,
      "epoch": 0.002875939173886472,
      "loss/policy_avg": 0.03272615000605583,
      "lr": 9.9814672801636e-06,
      "objective/entropy": 6.700323104858398,
      "objective/kl": 10.701581954956055,
      "objective/non_score_reward": -0.5350791215896606,
      "objective/rlhf_reward": -0.6897181971982564,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 63.513145446777344,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.60546875,
      "step": 29,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.998331069946289
    },
    {
      "episode": 496,
      "epoch": 0.0029718038130160216,
      "loss/policy_avg": 0.07188314199447632,
      "lr": 9.980828220858897e-06,
      "objective/entropy": -47.331199645996094,
      "objective/kl": 12.874979019165039,
      "objective/non_score_reward": -0.6437489986419678,
      "objective/rlhf_reward": -1.1963937664903224,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 77.876220703125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.5390625,
      "step": 30,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9967491626739502
    },
    {
      "episode": 512,
      "epoch": 0.0030676684521455705,
      "loss/policy_avg": 0.04047826677560806,
      "lr": 9.980189161554194e-06,
      "objective/entropy": 282.3853759765625,
      "objective/kl": 9.654375076293945,
      "objective/non_score_reward": -0.4827187657356262,
      "objective/rlhf_reward": -0.5716251668676566,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 64.11791229248047,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.89453125,
      "step": 31,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9997191429138184
    },
    {
      "episode": 528,
      "epoch": 0.0031635330912751195,
      "loss/policy_avg": 0.07097287476062775,
      "lr": 9.97955010224949e-06,
      "objective/entropy": 116.042236328125,
      "objective/kl": 14.595599174499512,
      "objective/non_score_reward": -0.7297799587249756,
      "objective/rlhf_reward": -0.7964137478926516,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 272.6925048828125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.3857421875,
      "step": 32,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0027289390563965
    },
    {
      "episode": 544,
      "epoch": 0.0032593977304046684,
      "loss/policy_avg": 0.5246497392654419,
      "lr": 9.978911042944786e-06,
      "objective/entropy": 8.318304061889648,
      "objective/kl": 16.622827529907227,
      "objective/non_score_reward": -0.831141471862793,
      "objective/rlhf_reward": -1.9990529752074906,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 159.0550079345703,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.703125,
      "step": 33,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9971305131912231
    },
    {
      "episode": 560,
      "epoch": 0.003355262369534218,
      "loss/policy_avg": 0.20073390007019043,
      "lr": 9.978271983640083e-06,
      "objective/entropy": 92.97464752197266,
      "objective/kl": 10.66767692565918,
      "objective/non_score_reward": -0.5333837866783142,
      "objective/rlhf_reward": 2.2664648383855823,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 89.14144134521484,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.544921875,
      "step": 34,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000143051147461
    },
    {
      "episode": 576,
      "epoch": 0.0034511270086637668,
      "loss/policy_avg": 0.04765152558684349,
      "lr": 9.977632924335378e-06,
      "objective/entropy": 149.43089294433594,
      "objective/kl": 16.67333221435547,
      "objective/non_score_reward": -0.8336665630340576,
      "objective/rlhf_reward": -0.9346663713455201,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 189.3590850830078,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4765625,
      "step": 35,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9986698627471924
    },
    {
      "episode": 592,
      "epoch": 0.0035469916477933157,
      "loss/policy_avg": 0.40008074045181274,
      "lr": 9.976993865030675e-06,
      "objective/entropy": 157.10501098632812,
      "objective/kl": 13.927867889404297,
      "objective/non_score_reward": -0.6963933706283569,
      "objective/rlhf_reward": -1.406971328941685,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 121.78231811523438,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.70703125,
      "step": 36,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9974275827407837
    },
    {
      "episode": 608,
      "epoch": 0.003642856286922865,
      "loss/policy_avg": 0.08663024008274078,
      "lr": 9.976354805725972e-06,
      "objective/entropy": 47.76446533203125,
      "objective/kl": 13.560833930969238,
      "objective/non_score_reward": -0.6780416965484619,
      "objective/rlhf_reward": -0.5894605539002753,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 43.71810531616211,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5078125,
      "step": 37,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9991490840911865
    },
    {
      "episode": 624,
      "epoch": 0.003738720926052414,
      "loss/policy_avg": 0.08268876373767853,
      "lr": 9.975715746421269e-06,
      "objective/entropy": 192.41729736328125,
      "objective/kl": 6.687016010284424,
      "objective/non_score_reward": -0.3343508243560791,
      "objective/rlhf_reward": 0.021846643354015427,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 67.82701873779297,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.619140625,
      "step": 38,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.999939203262329
    },
    {
      "episode": 640,
      "epoch": 0.003834585565181963,
      "loss/policy_avg": 0.05995899811387062,
      "lr": 9.975076687116566e-06,
      "objective/entropy": -98.350341796875,
      "objective/kl": 9.015666961669922,
      "objective/non_score_reward": -0.450783371925354,
      "objective/rlhf_reward": 0.14427768908268623,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 51.733055114746094,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5859375,
      "step": 39,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9974713325500488
    },
    {
      "episode": 656,
      "epoch": 0.003930450204311512,
      "loss/policy_avg": 0.18854951858520508,
      "lr": 9.97443762781186e-06,
      "objective/entropy": 141.67947387695312,
      "objective/kl": 10.309185028076172,
      "objective/non_score_reward": -0.5154592990875244,
      "objective/rlhf_reward": -0.6618371069431306,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 71.02857208251953,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.744140625,
      "step": 40,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9993364810943604
    },
    {
      "episode": 672,
      "epoch": 0.004026314843441061,
      "loss/policy_avg": 0.05062849074602127,
      "lr": 9.973798568507158e-06,
      "objective/entropy": -38.6858024597168,
      "objective/kl": 9.445882797241211,
      "objective/non_score_reward": -0.4722941517829895,
      "objective/rlhf_reward": -1.8891766667366028,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 5.4856438636779785,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.62890625,
      "step": 41,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9984209537506104
    },
    {
      "episode": 688,
      "epoch": 0.00412217948257061,
      "loss/policy_avg": 0.09501229226589203,
      "lr": 9.973159509202454e-06,
      "objective/entropy": 17.35771942138672,
      "objective/kl": 10.873266220092773,
      "objective/non_score_reward": -0.5436632633209229,
      "objective/rlhf_reward": -0.44131985406080876,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 98.38662719726562,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6953125,
      "step": 42,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9995697736740112
    },
    {
      "episode": 704,
      "epoch": 0.004218044121700159,
      "loss/policy_avg": 0.32498252391815186,
      "lr": 9.972520449897751e-06,
      "objective/entropy": 174.98866271972656,
      "objective/kl": 11.279447555541992,
      "objective/non_score_reward": -0.5639723539352417,
      "objective/rlhf_reward": -0.7749369321421384,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 62.73210144042969,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.552734375,
      "step": 43,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.0007286071777344
    },
    {
      "episode": 720,
      "epoch": 0.004313908760829708,
      "loss/policy_avg": 0.3995896577835083,
      "lr": 9.971881390593048e-06,
      "objective/entropy": 36.609832763671875,
      "objective/kl": 19.769756317138672,
      "objective/non_score_reward": -0.9884878993034363,
      "objective/rlhf_reward": -2.1291227295723667,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 164.33892822265625,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.708984375,
      "step": 44,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9945415258407593
    },
    {
      "episode": 736,
      "epoch": 0.004409773399959257,
      "loss/policy_avg": 0.17710548639297485,
      "lr": 9.971242331288345e-06,
      "objective/entropy": 93.23808288574219,
      "objective/kl": 16.88797378540039,
      "objective/non_score_reward": -0.8443987965583801,
      "objective/rlhf_reward": -1.7157356492882831,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 54.64923858642578,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.779296875,
      "step": 45,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9981857538223267
    },
    {
      "episode": 752,
      "epoch": 0.004505638039088807,
      "loss/policy_avg": 0.32767364382743835,
      "lr": 9.97060327198364e-06,
      "objective/entropy": 202.11843872070312,
      "objective/kl": 14.050471305847168,
      "objective/non_score_reward": -0.7025235295295715,
      "objective/rlhf_reward": -1.484581295281572,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 76.14016723632812,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7421875,
      "step": 46,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9997007846832275
    },
    {
      "episode": 768,
      "epoch": 0.004601502678218356,
      "loss/policy_avg": 0.08174459636211395,
      "lr": 9.969964212678937e-06,
      "objective/entropy": 54.37752151489258,
      "objective/kl": 15.1139497756958,
      "objective/non_score_reward": -0.75569748878479,
      "objective/rlhf_reward": -1.6635400888666343,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 83.4612045288086,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4296875,
      "step": 47,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9972434043884277
    },
    {
      "episode": 784,
      "epoch": 0.004697367317347905,
      "loss/policy_avg": 0.03365965187549591,
      "lr": 9.969325153374234e-06,
      "objective/entropy": 85.39935302734375,
      "objective/kl": 13.452342987060547,
      "objective/non_score_reward": -0.6726170778274536,
      "objective/rlhf_reward": -0.74305723138326,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 61.629390716552734,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.572265625,
      "step": 48,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9998470544815063
    },
    {
      "episode": 800,
      "epoch": 0.004793231956477454,
      "loss/policy_avg": 0.009335246868431568,
      "lr": 9.968686094069531e-06,
      "objective/entropy": 288.22564697265625,
      "objective/kl": 19.127742767333984,
      "objective/non_score_reward": -0.9563871026039124,
      "objective/rlhf_reward": -0.9018295153391089,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 176.43731689453125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.892578125,
      "step": 49,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9936624765396118
    },
    {
      "episode": 816,
      "epoch": 0.004889096595607003,
      "loss/policy_avg": 0.13336139917373657,
      "lr": 9.968047034764828e-06,
      "objective/entropy": -38.686851501464844,
      "objective/kl": 18.06523895263672,
      "objective/non_score_reward": -0.9032620191574097,
      "objective/rlhf_reward": -2.1320952503041024,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 179.73486328125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.65625,
      "step": 50,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.996016263961792
    },
    {
      "episode": 832,
      "epoch": 0.004984961234736552,
      "loss/policy_avg": 0.09758515655994415,
      "lr": 9.967407975460123e-06,
      "objective/entropy": -32.55284881591797,
      "objective/kl": 10.72513198852539,
      "objective/non_score_reward": -0.5362565517425537,
      "objective/rlhf_reward": -0.721194286544887,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 44.48727798461914,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.736328125,
      "step": 51,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9976041316986084
    },
    {
      "episode": 848,
      "epoch": 0.005080825873866101,
      "loss/policy_avg": 0.5202991366386414,
      "lr": 9.96676891615542e-06,
      "objective/entropy": 45.2802734375,
      "objective/kl": 16.129152297973633,
      "objective/non_score_reward": -0.8064576387405396,
      "objective/rlhf_reward": -1.2784193260239918,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 124.33740234375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.623046875,
      "step": 52,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9978928565979004
    },
    {
      "episode": 864,
      "epoch": 0.00517669051299565,
      "loss/policy_avg": 0.28677505254745483,
      "lr": 9.966129856850717e-06,
      "objective/entropy": -76.81179809570312,
      "objective/kl": 15.223251342773438,
      "objective/non_score_reward": -0.761162519454956,
      "objective/rlhf_reward": -1.5288782207094989,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 69.77767944335938,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.7890625,
      "step": 53,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9999229907989502
    },
    {
      "episode": 880,
      "epoch": 0.0052725551521251995,
      "loss/policy_avg": 0.20859162509441376,
      "lr": 9.965490797546014e-06,
      "objective/entropy": -21.344478607177734,
      "objective/kl": 10.70494556427002,
      "objective/non_score_reward": -0.535247266292572,
      "objective/rlhf_reward": -0.7623869264997064,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 98.75808715820312,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.56640625,
      "step": 54,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9975996017456055
    },
    {
      "episode": 896,
      "epoch": 0.0053684197912547485,
      "loss/policy_avg": 1.2579694986343384,
      "lr": 9.96485173824131e-06,
      "objective/entropy": 164.7299346923828,
      "objective/kl": 18.096805572509766,
      "objective/non_score_reward": -0.9048402309417725,
      "objective/rlhf_reward": -2.0152409709134873,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 95.78445434570312,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.736328125,
      "step": 55,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9966285228729248
    },
    {
      "episode": 912,
      "epoch": 0.0054642844303842975,
      "loss/policy_avg": 0.3564913868904114,
      "lr": 9.964212678936606e-06,
      "objective/entropy": 85.46858215332031,
      "objective/kl": 17.930484771728516,
      "objective/non_score_reward": -0.89652419090271,
      "objective/rlhf_reward": -1.4633905313172677,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 79.41477966308594,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4091796875,
      "step": 56,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9984946250915527
    },
    {
      "episode": 928,
      "epoch": 0.005560149069513846,
      "loss/policy_avg": 0.03960660099983215,
      "lr": 9.963573619631903e-06,
      "objective/entropy": 205.954833984375,
      "objective/kl": 17.15917205810547,
      "objective/non_score_reward": -0.8579585552215576,
      "objective/rlhf_reward": -1.3091281972089148,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 23.591196060180664,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.791015625,
      "step": 57,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.997645378112793
    },
    {
      "episode": 944,
      "epoch": 0.005656013708643395,
      "loss/policy_avg": -0.00983378104865551,
      "lr": 9.9629345603272e-06,
      "objective/entropy": -1.1022186279296875,
      "objective/kl": 16.26142692565918,
      "objective/non_score_reward": -0.8130713105201721,
      "objective/rlhf_reward": 1.1477148175239567,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 81.65092468261719,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.552734375,
      "step": 58,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.99957275390625
    },
    {
      "episode": 960,
      "epoch": 0.005751878347772944,
      "loss/policy_avg": 0.32060182094573975,
      "lr": 9.962295501022495e-06,
      "objective/entropy": 48.09014892578125,
      "objective/kl": 7.438636302947998,
      "objective/non_score_reward": -0.3719318211078644,
      "objective/rlhf_reward": 0.6349789739391469,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 7.77626895904541,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.822265625,
      "step": 59,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.008031129837036
    },
    {
      "episode": 976,
      "epoch": 0.005847742986902493,
      "loss/policy_avg": 0.2516993582248688,
      "lr": 9.961656441717792e-06,
      "objective/entropy": -46.64883804321289,
      "objective/kl": 19.601835250854492,
      "objective/non_score_reward": -0.9800918102264404,
      "objective/rlhf_reward": -2.594854134946985,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 181.5974578857422,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.609375,
      "step": 60,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9988071918487549
    },
    {
      "episode": 992,
      "epoch": 0.005943607626032043,
      "loss/policy_avg": 0.1109720841050148,
      "lr": 9.961017382413088e-06,
      "objective/entropy": 97.6422348022461,
      "objective/kl": 13.844486236572266,
      "objective/non_score_reward": -0.692224383354187,
      "objective/rlhf_reward": -1.2126380791335847,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 96.34603118896484,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.583984375,
      "step": 61,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9974918365478516
    },
    {
      "episode": 1008,
      "epoch": 0.006039472265161592,
      "loss/policy_avg": -0.05115126073360443,
      "lr": 9.960378323108385e-06,
      "objective/entropy": 34.42061996459961,
      "objective/kl": 14.079090118408203,
      "objective/non_score_reward": -0.7039545774459839,
      "objective/rlhf_reward": -1.4565682944997977,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 49.87873840332031,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.677734375,
      "step": 62,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9982357025146484
    },
    {
      "episode": 1024,
      "epoch": 0.006135336904291141,
      "loss/policy_avg": 0.22280101478099823,
      "lr": 9.959739263803682e-06,
      "objective/entropy": -24.89067840576172,
      "objective/kl": 19.501176834106445,
      "objective/non_score_reward": -0.9750589728355408,
      "objective/rlhf_reward": -2.4496376319841,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 243.47512817382812,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.888671875,
      "step": 63,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999916911125183
    },
    {
      "episode": 1040,
      "epoch": 0.00623120154342069,
      "loss/policy_avg": 0.36840492486953735,
      "lr": 9.959100204498979e-06,
      "objective/entropy": 134.6929931640625,
      "objective/kl": 22.332670211791992,
      "objective/non_score_reward": -1.1166335344314575,
      "objective/rlhf_reward": -2.641705389293741,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 136.65045166015625,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.65234375,
      "step": 64,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9981276988983154
    },
    {
      "episode": 1056,
      "epoch": 0.006327066182550239,
      "loss/policy_avg": 0.09098342061042786,
      "lr": 9.958461145194274e-06,
      "objective/entropy": -26.864063262939453,
      "objective/kl": 13.052759170532227,
      "objective/non_score_reward": -0.6526379585266113,
      "objective/rlhf_reward": -0.7857228770580997,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 62.885929107666016,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.603515625,
      "step": 65,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.997183084487915
    },
    {
      "episode": 1072,
      "epoch": 0.006422930821679788,
      "loss/policy_avg": 0.27086368203163147,
      "lr": 9.957822085889571e-06,
      "objective/entropy": -58.01667404174805,
      "objective/kl": 16.48623275756836,
      "objective/non_score_reward": -0.8243115544319153,
      "objective/rlhf_reward": -1.635386770189391,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 153.92050170898438,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.583984375,
      "step": 66,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0005505084991455
    },
    {
      "episode": 1088,
      "epoch": 0.006518795460809337,
      "loss/policy_avg": 1.2388324737548828,
      "lr": 9.957183026584868e-06,
      "objective/entropy": 99.91399383544922,
      "objective/kl": 21.524110794067383,
      "objective/non_score_reward": -1.0762056112289429,
      "objective/rlhf_reward": -2.6429626993542774,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 170.69760131835938,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.64453125,
      "step": 67,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9946773052215576
    },
    {
      "episode": 1104,
      "epoch": 0.006614660099938887,
      "loss/policy_avg": 0.330521821975708,
      "lr": 9.956543967280165e-06,
      "objective/entropy": -76.99481201171875,
      "objective/kl": 15.58948802947998,
      "objective/non_score_reward": -0.7794743776321411,
      "objective/rlhf_reward": -1.7178976856172086,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 218.45574951171875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.64453125,
      "step": 68,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9997715950012207
    },
    {
      "episode": 1120,
      "epoch": 0.006710524739068436,
      "loss/policy_avg": 0.11920663714408875,
      "lr": 9.955904907975462e-06,
      "objective/entropy": 70.55160522460938,
      "objective/kl": 20.134777069091797,
      "objective/non_score_reward": -1.0067389011383057,
      "objective/rlhf_reward": -2.6853197722727353,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 62.195674896240234,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.34765625,
      "step": 69,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.001072406768799
    },
    {
      "episode": 1136,
      "epoch": 0.006806389378197985,
      "loss/policy_avg": -0.17695794999599457,
      "lr": 9.955265848670757e-06,
      "objective/entropy": 101.99272918701172,
      "objective/kl": 12.69788932800293,
      "objective/non_score_reward": -0.6348943710327148,
      "objective/rlhf_reward": -2.539577692747116,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 64.835693359375,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.44140625,
      "step": 70,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0115315914154053
    },
    {
      "episode": 1152,
      "epoch": 0.0069022540173275335,
      "loss/policy_avg": 0.35137245059013367,
      "lr": 9.954626789366054e-06,
      "objective/entropy": 79.80499267578125,
      "objective/kl": 21.120101928710938,
      "objective/non_score_reward": -1.0560050010681152,
      "objective/rlhf_reward": -2.1013141296067577,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 124.16864776611328,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.68359375,
      "step": 71,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.998917818069458
    },
    {
      "episode": 1168,
      "epoch": 0.0069981186564570825,
      "loss/policy_avg": 0.07422849535942078,
      "lr": 9.95398773006135e-06,
      "objective/entropy": 9.376724243164062,
      "objective/kl": 15.093628883361816,
      "objective/non_score_reward": -0.7546814680099487,
      "objective/rlhf_reward": -1.6594760653719138,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 47.567962646484375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.65234375,
      "step": 72,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9958832263946533
    },
    {
      "episode": 1184,
      "epoch": 0.0070939832955866314,
      "loss/policy_avg": 0.11969298124313354,
      "lr": 9.953348670756648e-06,
      "objective/entropy": 133.57423400878906,
      "objective/kl": 20.2343807220459,
      "objective/non_score_reward": -1.0117191076278687,
      "objective/rlhf_reward": -1.1231571778070655,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 93.79672241210938,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.423828125,
      "step": 73,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.0005669593811035
    },
    {
      "episode": 1200,
      "epoch": 0.00718984793471618,
      "loss/policy_avg": 0.2395152747631073,
      "lr": 9.952709611451944e-06,
      "objective/entropy": 31.68697166442871,
      "objective/kl": 20.96116828918457,
      "objective/non_score_reward": -1.0480585098266602,
      "objective/rlhf_reward": -2.711281481202006,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 194.83474731445312,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.669921875,
      "step": 74,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9953577518463135
    },
    {
      "episode": 1216,
      "epoch": 0.00728571257384573,
      "loss/policy_avg": 0.27856501936912537,
      "lr": 9.952070552147241e-06,
      "objective/entropy": 119.42091369628906,
      "objective/kl": 11.30095100402832,
      "objective/non_score_reward": -0.5650476217269897,
      "objective/rlhf_reward": -0.9185547738367612,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 59.14590835571289,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.75,
      "step": 75,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9953703880310059
    },
    {
      "episode": 1232,
      "epoch": 0.007381577212975279,
      "loss/policy_avg": 0.21030786633491516,
      "lr": 9.951431492842536e-06,
      "objective/entropy": 7.310768127441406,
      "objective/kl": 6.645857810974121,
      "objective/non_score_reward": -0.3322928845882416,
      "objective/rlhf_reward": 0.04943063011993787,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 14.611559867858887,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.591796875,
      "step": 76,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9996391534805298
    },
    {
      "episode": 1248,
      "epoch": 0.007477441852104828,
      "loss/policy_avg": 0.4117072820663452,
      "lr": 9.950792433537833e-06,
      "objective/entropy": -109.53082275390625,
      "objective/kl": 11.825650215148926,
      "objective/non_score_reward": -0.5912825465202332,
      "objective/rlhf_reward": 0.03486987352371207,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 19.0810604095459,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.6171875,
      "step": 77,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9981213808059692
    },
    {
      "episode": 1264,
      "epoch": 0.007573306491234377,
      "loss/policy_avg": 0.2597622275352478,
      "lr": 9.950153374233129e-06,
      "objective/entropy": -29.7529296875,
      "objective/kl": 18.43012809753418,
      "objective/non_score_reward": -0.9215063452720642,
      "objective/rlhf_reward": -2.2860254704952236,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 267.2847900390625,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.70703125,
      "step": 78,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.997544765472412
    },
    {
      "episode": 1280,
      "epoch": 0.007669171130363926,
      "loss/policy_avg": 0.2407466471195221,
      "lr": 9.949514314928425e-06,
      "objective/entropy": 14.07373046875,
      "objective/kl": 20.781753540039062,
      "objective/non_score_reward": -1.0390876531600952,
      "objective/rlhf_reward": -1.2326316579591956,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 147.4822235107422,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.724609375,
      "step": 79,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9987099170684814
    },
    {
      "episode": 1296,
      "epoch": 0.007765035769493475,
      "loss/policy_avg": 0.17344285547733307,
      "lr": 9.948875255623722e-06,
      "objective/entropy": 112.44259643554688,
      "objective/kl": 10.0985746383667,
      "objective/non_score_reward": -0.504928708076477,
      "objective/rlhf_reward": 0.38028510808944693,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 4.8866167068481445,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.443359375,
      "step": 80,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0143842697143555
    },
    {
      "episode": 1312,
      "epoch": 0.007860900408623025,
      "loss/policy_avg": 0.14816004037857056,
      "lr": 9.94823619631902e-06,
      "objective/entropy": 67.11033630371094,
      "objective/kl": 17.487518310546875,
      "objective/non_score_reward": -0.8743758797645569,
      "objective/rlhf_reward": -2.1558679251963193,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 18.69343376159668,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4619140625,
      "step": 81,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.998113751411438
    },
    {
      "episode": 1328,
      "epoch": 0.007956765047752574,
      "loss/policy_avg": 0.2536642849445343,
      "lr": 9.947597137014316e-06,
      "objective/entropy": -71.85224914550781,
      "objective/kl": 11.223343849182129,
      "objective/non_score_reward": -0.5611672401428223,
      "objective/rlhf_reward": -0.7637163875654935,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 37.78028869628906,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.48828125,
      "step": 82,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0003702640533447
    },
    {
      "episode": 1344,
      "epoch": 0.008052629686882123,
      "loss/policy_avg": 0.3479039669036865,
      "lr": 9.946958077709611e-06,
      "objective/entropy": 146.41241455078125,
      "objective/kl": 20.458145141601562,
      "objective/non_score_reward": -1.0229072570800781,
      "objective/rlhf_reward": -2.732379042838497,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 64.28889465332031,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.705078125,
      "step": 83,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9976742267608643
    },
    {
      "episode": 1360,
      "epoch": 0.008148494326011672,
      "loss/policy_avg": 0.10525624454021454,
      "lr": 9.946319018404908e-06,
      "objective/entropy": -43.42662048339844,
      "objective/kl": 13.858359336853027,
      "objective/non_score_reward": -0.6929180026054382,
      "objective/rlhf_reward": -0.6489658228316642,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 61.37925720214844,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.48828125,
      "step": 84,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0012595653533936
    },
    {
      "episode": 1376,
      "epoch": 0.00824435896514122,
      "loss/policy_avg": 0.3409525156021118,
      "lr": 9.945679959100205e-06,
      "objective/entropy": 1.5508041381835938,
      "objective/kl": 19.05010223388672,
      "objective/non_score_reward": -0.9525051116943359,
      "objective/rlhf_reward": -2.205900583330708,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 97.6533203125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.63671875,
      "step": 85,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000422477722168
    },
    {
      "episode": 1392,
      "epoch": 0.00834022360427077,
      "loss/policy_avg": 0.3110717535018921,
      "lr": 9.945040899795502e-06,
      "objective/entropy": 215.75965881347656,
      "objective/kl": 18.800819396972656,
      "objective/non_score_reward": -0.9400409460067749,
      "objective/rlhf_reward": -2.156043860975819,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 84.93620300292969,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.669921875,
      "step": 86,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9962902069091797
    },
    {
      "episode": 1408,
      "epoch": 0.008436088243400319,
      "loss/policy_avg": 0.02868543565273285,
      "lr": 9.944401840490799e-06,
      "objective/entropy": 154.10025024414062,
      "objective/kl": 13.492873191833496,
      "objective/non_score_reward": -0.6746436357498169,
      "objective/rlhf_reward": -0.9652413214246431,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 42.483882904052734,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.44921875,
      "step": 87,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9983662366867065
    },
    {
      "episode": 1424,
      "epoch": 0.008531952882529868,
      "loss/policy_avg": 0.07607420533895493,
      "lr": 9.943762781186096e-06,
      "objective/entropy": 202.40365600585938,
      "objective/kl": 13.719297409057617,
      "objective/non_score_reward": -0.685964822769165,
      "objective/rlhf_reward": 1.6561407089233402,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 20.57819175720215,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.728515625,
      "step": 88,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.999366283416748
    },
    {
      "episode": 1440,
      "epoch": 0.008627817521659416,
      "loss/policy_avg": 0.16665664315223694,
      "lr": 9.94312372188139e-06,
      "objective/entropy": -100.20193481445312,
      "objective/kl": 15.216776847839355,
      "objective/non_score_reward": -0.7608388662338257,
      "objective/rlhf_reward": -1.4392355120817002,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 85.36731719970703,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.626953125,
      "step": 89,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9990196228027344
    },
    {
      "episode": 1456,
      "epoch": 0.008723682160788965,
      "loss/policy_avg": 0.19817781448364258,
      "lr": 9.942484662576688e-06,
      "objective/entropy": -0.7409725189208984,
      "objective/kl": 10.389724731445312,
      "objective/non_score_reward": -0.5194862484931946,
      "objective/rlhf_reward": 2.3220549762248996,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 12.642692565917969,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.52734375,
      "step": 90,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9989277124404907
    },
    {
      "episode": 1472,
      "epoch": 0.008819546799918514,
      "loss/policy_avg": 0.2365586757659912,
      "lr": 9.941845603271985e-06,
      "objective/entropy": 152.64306640625,
      "objective/kl": 21.58309555053711,
      "objective/non_score_reward": -1.0791547298431396,
      "objective/rlhf_reward": -2.9573691723093223,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 87.72661590576172,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.771484375,
      "step": 91,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9999784231185913
    },
    {
      "episode": 1488,
      "epoch": 0.008915411439048063,
      "loss/policy_avg": 0.059907689690589905,
      "lr": 9.941206543967281e-06,
      "objective/entropy": 89.6580810546875,
      "objective/kl": 16.996726989746094,
      "objective/non_score_reward": -0.8498364686965942,
      "objective/rlhf_reward": -1.9755135669308581,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 72.40145874023438,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.861328125,
      "step": 92,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0003039836883545
    },
    {
      "episode": 1504,
      "epoch": 0.009011276078177614,
      "loss/policy_avg": 0.14265713095664978,
      "lr": 9.940567484662578e-06,
      "objective/entropy": -33.708492279052734,
      "objective/kl": 15.94516372680664,
      "objective/non_score_reward": -0.797258198261261,
      "objective/rlhf_reward": -0.2653137638580527,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 78.95989990234375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.53125,
      "step": 93,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.997810959815979
    },
    {
      "episode": 1520,
      "epoch": 0.009107140717307163,
      "loss/policy_avg": -0.018713245168328285,
      "lr": 9.939928425357874e-06,
      "objective/entropy": -3.091245651245117,
      "objective/kl": 14.482427597045898,
      "objective/non_score_reward": -0.7241213917732239,
      "objective/rlhf_reward": -1.2346261046534641,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 56.76847839355469,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.501953125,
      "step": 94,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9993091821670532
    },
    {
      "episode": 1536,
      "epoch": 0.009203005356436712,
      "loss/policy_avg": -0.0069353943690657616,
      "lr": 9.93928936605317e-06,
      "objective/entropy": 95.46006774902344,
      "objective/kl": 20.928672790527344,
      "objective/non_score_reward": -1.046433687210083,
      "objective/rlhf_reward": -2.360906060012888,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 103.58160400390625,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.58203125,
      "step": 95,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9974098205566406
    },
    {
      "episode": 1552,
      "epoch": 0.009298869995566261,
      "loss/policy_avg": 0.0523187518119812,
      "lr": 9.938650306748467e-06,
      "objective/entropy": 16.342994689941406,
      "objective/kl": 20.205509185791016,
      "objective/non_score_reward": -1.0102753639221191,
      "objective/rlhf_reward": 0.35889836549758947,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 84.55277252197266,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4697265625,
      "step": 96,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000453233718872
    },
    {
      "episode": 1568,
      "epoch": 0.00939473463469581,
      "loss/policy_avg": 0.18428044021129608,
      "lr": 9.938011247443764e-06,
      "objective/entropy": -31.386062622070312,
      "objective/kl": 19.641075134277344,
      "objective/non_score_reward": -0.9820537567138672,
      "objective/rlhf_reward": -1.8055088541665412,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 92.56884002685547,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.59765625,
      "step": 97,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0001230239868164
    },
    {
      "episode": 1584,
      "epoch": 0.009490599273825359,
      "loss/policy_avg": -0.11768925935029984,
      "lr": 9.937372188139061e-06,
      "objective/entropy": -29.0854434967041,
      "objective/kl": 16.647226333618164,
      "objective/non_score_reward": -0.8323614001274109,
      "objective/rlhf_reward": -1.9701957342371177,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 1.0866272449493408,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.541015625,
      "step": 98,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0018460750579834
    },
    {
      "episode": 1600,
      "epoch": 0.009586463912954908,
      "loss/policy_avg": 0.06727765500545502,
      "lr": 9.936733128834358e-06,
      "objective/entropy": 96.53413391113281,
      "objective/kl": 21.015684127807617,
      "objective/non_score_reward": -1.0507843494415283,
      "objective/rlhf_reward": -2.8031371593475343,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 36.56340026855469,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.646484375,
      "step": 99,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9949061870574951
    },
    {
      "episode": 1616,
      "epoch": 0.009682328552084457,
      "loss/policy_avg": 0.28386813402175903,
      "lr": 9.936094069529653e-06,
      "objective/entropy": 33.901954650878906,
      "objective/kl": 19.533782958984375,
      "objective/non_score_reward": -0.9766892194747925,
      "objective/rlhf_reward": -2.425804230387568,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 162.0339813232422,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5859375,
      "step": 100,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9985501766204834
    },
    {
      "episode": 1632,
      "epoch": 0.009778193191214006,
      "loss/policy_avg": 0.11220409721136093,
      "lr": 9.93545501022495e-06,
      "objective/entropy": -3.93096923828125,
      "objective/kl": 22.981700897216797,
      "objective/non_score_reward": -1.1490850448608398,
      "objective/rlhf_reward": -3.1725080504017744,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 46.0514030456543,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6328125,
      "step": 101,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0070252418518066
    },
    {
      "episode": 1648,
      "epoch": 0.009874057830343555,
      "loss/policy_avg": 0.20420242846012115,
      "lr": 9.934815950920245e-06,
      "objective/entropy": 198.98751831054688,
      "objective/kl": 17.92270278930664,
      "objective/non_score_reward": -0.8961352109909058,
      "objective/rlhf_reward": -1.759712155136179,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 55.74137878417969,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.65234375,
      "step": 102,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9980387687683105
    },
    {
      "episode": 1664,
      "epoch": 0.009969922469473104,
      "loss/policy_avg": 0.27041423320770264,
      "lr": 9.934176891615542e-06,
      "objective/entropy": 1.5637626647949219,
      "objective/kl": 12.633028030395508,
      "objective/non_score_reward": -0.6316514015197754,
      "objective/rlhf_reward": -0.7017769768563022,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 13.92137622833252,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4208984375,
      "step": 103,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9987752437591553
    },
    {
      "episode": 1680,
      "epoch": 0.010065787108602653,
      "loss/policy_avg": 0.318324476480484,
      "lr": 9.933537832310839e-06,
      "objective/entropy": 218.76858520507812,
      "objective/kl": 21.40100860595703,
      "objective/non_score_reward": -1.0700504779815674,
      "objective/rlhf_reward": -2.9385662584597165,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 90.99249267578125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.677734375,
      "step": 104,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.998801827430725
    },
    {
      "episode": 1696,
      "epoch": 0.010161651747732202,
      "loss/policy_avg": 0.3075984716415405,
      "lr": 9.932898773006136e-06,
      "objective/entropy": -56.81090545654297,
      "objective/kl": 10.457717895507812,
      "objective/non_score_reward": -0.5228859186172485,
      "objective/rlhf_reward": -0.7129414687431871,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 48.63943862915039,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.658203125,
      "step": 105,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.995776653289795
    },
    {
      "episode": 1712,
      "epoch": 0.01025751638686175,
      "loss/policy_avg": 0.5551585555076599,
      "lr": 9.932259713701433e-06,
      "objective/entropy": -48.12900924682617,
      "objective/kl": 21.915470123291016,
      "objective/non_score_reward": -1.0957735776901245,
      "objective/rlhf_reward": -1.459375207067701,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 33.369083404541016,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.71484375,
      "step": 106,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.995157241821289
    },
    {
      "episode": 1728,
      "epoch": 0.0103533810259913,
      "loss/policy_avg": 0.252463161945343,
      "lr": 9.931620654396728e-06,
      "objective/entropy": -69.64755249023438,
      "objective/kl": 15.248108863830566,
      "objective/non_score_reward": -0.7624054551124573,
      "objective/rlhf_reward": -1.707986166983276,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 59.05755615234375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7578125,
      "step": 107,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9963257312774658
    },
    {
      "episode": 1744,
      "epoch": 0.01044924566512085,
      "loss/policy_avg": 0.13919854164123535,
      "lr": 9.930981595092025e-06,
      "objective/entropy": -133.55258178710938,
      "objective/kl": 17.2213134765625,
      "objective/non_score_reward": -0.8610656261444092,
      "objective/rlhf_reward": -2.0850126979097556,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 32.41887664794922,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5234375,
      "step": 108,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9992578029632568
    },
    {
      "episode": 1760,
      "epoch": 0.010545110304250399,
      "loss/policy_avg": 0.5300755500793457,
      "lr": 9.930342535787322e-06,
      "objective/entropy": -9.471179962158203,
      "objective/kl": 18.607471466064453,
      "objective/non_score_reward": -0.9303736090660095,
      "objective/rlhf_reward": -2.3214945554733273,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 31.75185203552246,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.654296875,
      "step": 109,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9994070529937744
    },
    {
      "episode": 1776,
      "epoch": 0.010640974943379948,
      "loss/policy_avg": 0.17107412219047546,
      "lr": 9.929703476482619e-06,
      "objective/entropy": 72.44110107421875,
      "objective/kl": 16.862125396728516,
      "objective/non_score_reward": -0.8431062698364258,
      "objective/rlhf_reward": -3.372425138950348,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 66.22834777832031,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.70703125,
      "step": 110,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.995293378829956
    },
    {
      "episode": 1792,
      "epoch": 0.010736839582509497,
      "loss/policy_avg": -0.11443672329187393,
      "lr": 9.929064417177915e-06,
      "objective/entropy": 80.82670593261719,
      "objective/kl": 18.79993438720703,
      "objective/non_score_reward": -0.9399967789649963,
      "objective/rlhf_reward": -2.336154927213756,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 31.270248413085938,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.5625,
      "step": 111,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.007622241973877
    },
    {
      "episode": 1808,
      "epoch": 0.010832704221639046,
      "loss/policy_avg": 0.0878123939037323,
      "lr": 9.928425357873212e-06,
      "objective/entropy": -118.92440795898438,
      "objective/kl": 17.83495330810547,
      "objective/non_score_reward": -0.8917477130889893,
      "objective/rlhf_reward": -2.2253551392847593,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 20.88257598876953,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7421875,
      "step": 112,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.996976613998413
    },
    {
      "episode": 1824,
      "epoch": 0.010928568860768595,
      "loss/policy_avg": 0.18364591896533966,
      "lr": 9.927786298568507e-06,
      "objective/entropy": 8.144821166992188,
      "objective/kl": 14.821235656738281,
      "objective/non_score_reward": -0.741061806678772,
      "objective/rlhf_reward": -1.2309138337771097,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 17.778968811035156,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62890625,
      "step": 113,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000706911087036
    },
    {
      "episode": 1840,
      "epoch": 0.011024433499898144,
      "loss/policy_avg": 0.06979192793369293,
      "lr": 9.927147239263804e-06,
      "objective/entropy": -2.9724502563476562,
      "objective/kl": 17.076000213623047,
      "objective/non_score_reward": -0.8538000583648682,
      "objective/rlhf_reward": -1.8994284508549533,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 46.98078918457031,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.798828125,
      "step": 114,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9999396800994873
    },
    {
      "episode": 1856,
      "epoch": 0.011120298139027693,
      "loss/policy_avg": 0.27465301752090454,
      "lr": 9.926508179959101e-06,
      "objective/entropy": 40.056610107421875,
      "objective/kl": 22.515907287597656,
      "objective/non_score_reward": -1.1257953643798828,
      "objective/rlhf_reward": -2.8413221291905506,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 81.93817138671875,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.57421875,
      "step": 115,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0007479190826416
    },
    {
      "episode": 1872,
      "epoch": 0.011216162778157242,
      "loss/policy_avg": 0.3945024013519287,
      "lr": 9.925869120654398e-06,
      "objective/entropy": 69.15873718261719,
      "objective/kl": 21.74050521850586,
      "objective/non_score_reward": -1.0870254039764404,
      "objective/rlhf_reward": -3.0225888824760148,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 38.46895980834961,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.59765625,
      "step": 116,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0014419555664062
    },
    {
      "episode": 1888,
      "epoch": 0.01131202741728679,
      "loss/policy_avg": 0.5689772367477417,
      "lr": 9.925230061349695e-06,
      "objective/entropy": 144.26678466796875,
      "objective/kl": 14.530990600585938,
      "objective/non_score_reward": -0.726549506187439,
      "objective/rlhf_reward": -1.1728648702303568,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 1.715579628944397,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.8203125,
      "step": 117,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0058815479278564
    },
    {
      "episode": 1904,
      "epoch": 0.01140789205641634,
      "loss/policy_avg": -0.025625256821513176,
      "lr": 9.92459100204499e-06,
      "objective/entropy": -91.6683120727539,
      "objective/kl": 16.61312484741211,
      "objective/non_score_reward": -0.8306561708450317,
      "objective/rlhf_reward": -1.944022663918835,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 18.064186096191406,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.4990234375,
      "step": 118,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999868392944336
    },
    {
      "episode": 1920,
      "epoch": 0.011503756695545889,
      "loss/policy_avg": 0.4135175943374634,
      "lr": 9.923951942740287e-06,
      "objective/entropy": 145.33905029296875,
      "objective/kl": 18.559207916259766,
      "objective/non_score_reward": -0.9279603958129883,
      "objective/rlhf_reward": -1.5891353509583808,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 19.033662796020508,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.66796875,
      "step": 119,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9981472492218018
    },
    {
      "episode": 1936,
      "epoch": 0.011599621334675438,
      "loss/policy_avg": 0.3322446942329407,
      "lr": 9.923312883435584e-06,
      "objective/entropy": 109.6761474609375,
      "objective/kl": 18.231651306152344,
      "objective/non_score_reward": -0.9115825891494751,
      "objective/rlhf_reward": -1.2463304907083512,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 108.51126098632812,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.65625,
      "step": 120,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.996593952178955
    },
    {
      "episode": 1952,
      "epoch": 0.011695485973804987,
      "loss/policy_avg": 0.22522342205047607,
      "lr": 9.92267382413088e-06,
      "objective/entropy": 95.46246337890625,
      "objective/kl": 16.838998794555664,
      "objective/non_score_reward": -0.841949999332428,
      "objective/rlhf_reward": -1.8520282743298375,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 14.038084983825684,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.8046875,
      "step": 121,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.997774362564087
    },
    {
      "episode": 1968,
      "epoch": 0.011791350612934537,
      "loss/policy_avg": 0.18379229307174683,
      "lr": 9.922034764826178e-06,
      "objective/entropy": 138.12388610839844,
      "objective/kl": 25.93743324279785,
      "objective/non_score_reward": -1.2968716621398926,
      "objective/rlhf_reward": -3.828236812089367,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 26.206398010253906,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.734375,
      "step": 122,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0011606216430664
    },
    {
      "episode": 1984,
      "epoch": 0.011887215252064086,
      "loss/policy_avg": 0.31653979420661926,
      "lr": 9.921395705521473e-06,
      "objective/entropy": -44.61676788330078,
      "objective/kl": 21.166324615478516,
      "objective/non_score_reward": -1.0583162307739258,
      "objective/rlhf_reward": -2.9077520704566666,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 29.74887466430664,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.521484375,
      "step": 123,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9996273517608643
    },
    {
      "episode": 2000,
      "epoch": 0.011983079891193635,
      "loss/policy_avg": 0.1589316874742508,
      "lr": 9.92075664621677e-06,
      "objective/entropy": -77.4912109375,
      "objective/kl": 20.79126739501953,
      "objective/non_score_reward": -1.0395634174346924,
      "objective/rlhf_reward": -2.4249199191729227,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 133.58343505859375,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.66015625,
      "step": 124,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9961724281311035
    },
    {
      "episode": 2016,
      "epoch": 0.012078944530323184,
      "loss/policy_avg": 0.2586688995361328,
      "lr": 9.920117586912067e-06,
      "objective/entropy": 139.38818359375,
      "objective/kl": 21.455245971679688,
      "objective/non_score_reward": -1.072762370109558,
      "objective/rlhf_reward": -2.775277876647648,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 47.609947204589844,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.8125,
      "step": 125,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9975237846374512
    },
    {
      "episode": 2032,
      "epoch": 0.012174809169452733,
      "loss/policy_avg": 0.16066747903823853,
      "lr": 9.919478527607362e-06,
      "objective/entropy": 72.43231201171875,
      "objective/kl": 20.59688377380371,
      "objective/non_score_reward": -1.0298442840576172,
      "objective/rlhf_reward": 0.28062304258346593,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 75.74966430664062,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.529296875,
      "step": 126,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.998389720916748
    },
    {
      "episode": 2048,
      "epoch": 0.012270673808582282,
      "loss/policy_avg": 0.07932023704051971,
      "lr": 9.918839468302659e-06,
      "objective/entropy": -12.7745361328125,
      "objective/kl": 20.53061294555664,
      "objective/non_score_reward": -1.0265307426452637,
      "objective/rlhf_reward": -2.7275206232942164,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 19.110069274902344,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.55859375,
      "step": 127,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9984248876571655
    },
    {
      "episode": 2064,
      "epoch": 0.012366538447711831,
      "loss/policy_avg": 0.27331969141960144,
      "lr": 9.918200408997956e-06,
      "objective/entropy": 101.82013702392578,
      "objective/kl": 18.18286895751953,
      "objective/non_score_reward": -0.9091434478759766,
      "objective/rlhf_reward": -2.2579716230310023,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 6.703115463256836,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.556640625,
      "step": 128,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0009617805480957
    },
    {
      "episode": 2080,
      "epoch": 0.01246240308684138,
      "loss/policy_avg": 0.4916057586669922,
      "lr": 9.917561349693252e-06,
      "objective/entropy": 88.1321029663086,
      "objective/kl": 23.30657958984375,
      "objective/non_score_reward": -1.165329098701477,
      "objective/rlhf_reward": -3.3020663795217704,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 142.93795776367188,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.611328125,
      "step": 129,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9967398643493652
    },
    {
      "episode": 2096,
      "epoch": 0.012558267725970929,
      "loss/policy_avg": 0.16071423888206482,
      "lr": 9.91692229038855e-06,
      "objective/entropy": 136.1899871826172,
      "objective/kl": 15.380975723266602,
      "objective/non_score_reward": -0.769048810005188,
      "objective/rlhf_reward": -0.6761951804161073,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 28.551767349243164,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.56640625,
      "step": 130,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.03311824798584
    },
    {
      "episode": 2112,
      "epoch": 0.012654132365100478,
      "loss/policy_avg": 0.0021135974675416946,
      "lr": 9.916283231083844e-06,
      "objective/entropy": -71.15084838867188,
      "objective/kl": 18.961715698242188,
      "objective/non_score_reward": -0.9480857849121094,
      "objective/rlhf_reward": -2.1304838709241016,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 1.844127893447876,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4833984375,
      "step": 131,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0009913444519043
    },
    {
      "episode": 2128,
      "epoch": 0.012749997004230027,
      "loss/policy_avg": 0.042635850608348846,
      "lr": 9.915644171779141e-06,
      "objective/entropy": 20.673603057861328,
      "objective/kl": 15.986173629760742,
      "objective/non_score_reward": -0.7993086576461792,
      "objective/rlhf_reward": -1.8555989473158414,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 36.049034118652344,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.67578125,
      "step": 132,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.998380422592163
    },
    {
      "episode": 2144,
      "epoch": 0.012845861643359576,
      "loss/policy_avg": 0.46513473987579346,
      "lr": 9.915005112474438e-06,
      "objective/entropy": 5.5274505615234375,
      "objective/kl": 19.590290069580078,
      "objective/non_score_reward": -0.979514479637146,
      "objective/rlhf_reward": -2.5394558692849696,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 12.074180603027344,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.642578125,
      "step": 133,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0011277198791504
    },
    {
      "episode": 2160,
      "epoch": 0.012941726282489125,
      "loss/policy_avg": 0.245748370885849,
      "lr": 9.914366053169735e-06,
      "objective/entropy": 65.60797119140625,
      "objective/kl": 19.637710571289062,
      "objective/non_score_reward": -0.9818854928016663,
      "objective/rlhf_reward": -1.980130786971982,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 50.17578125,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.791015625,
      "step": 134,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9983665943145752
    },
    {
      "episode": 2176,
      "epoch": 0.013037590921618674,
      "loss/policy_avg": 0.02180427499115467,
      "lr": 9.913726993865032e-06,
      "objective/entropy": 0.8936500549316406,
      "objective/kl": 24.33076286315918,
      "objective/non_score_reward": -1.2165381908416748,
      "objective/rlhf_reward": -3.524517109900146,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 69.30375671386719,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5009765625,
      "step": 135,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.99745774269104
    },
    {
      "episode": 2192,
      "epoch": 0.013133455560748224,
      "loss/policy_avg": 0.36717042326927185,
      "lr": 9.913087934560329e-06,
      "objective/entropy": 83.415283203125,
      "objective/kl": 21.930896759033203,
      "objective/non_score_reward": -1.0965447425842285,
      "objective/rlhf_reward": -1.4624603136789527,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 79.15277862548828,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.546875,
      "step": 136,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.998199701309204
    },
    {
      "episode": 2208,
      "epoch": 0.013229320199877773,
      "loss/policy_avg": 0.2460360825061798,
      "lr": 9.912448875255624e-06,
      "objective/entropy": 137.11976623535156,
      "objective/kl": 21.218502044677734,
      "objective/non_score_reward": -1.060925006866455,
      "objective/rlhf_reward": -2.8198681666451373,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 67.851806640625,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.666015625,
      "step": 137,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9969704151153564
    },
    {
      "episode": 2224,
      "epoch": 0.013325184839007322,
      "loss/policy_avg": 0.21244561672210693,
      "lr": 9.911809815950921e-06,
      "objective/entropy": 175.0180206298828,
      "objective/kl": 16.889467239379883,
      "objective/non_score_reward": -0.8444733619689941,
      "objective/rlhf_reward": -1.4304821593331654,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 78.4537353515625,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.515625,
      "step": 138,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9985227584838867
    },
    {
      "episode": 2240,
      "epoch": 0.013421049478136871,
      "loss/policy_avg": 0.18417471647262573,
      "lr": 9.911170756646218e-06,
      "objective/entropy": 224.734619140625,
      "objective/kl": 33.112342834472656,
      "objective/non_score_reward": -1.6556169986724854,
      "objective/rlhf_reward": -4.889135018984477,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 160.8165283203125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.7109375,
      "step": 139,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9992772340774536
    },
    {
      "episode": 2256,
      "epoch": 0.01351691411726642,
      "loss/policy_avg": 0.40639203786849976,
      "lr": 9.910531697341515e-06,
      "objective/entropy": 69.94343566894531,
      "objective/kl": 24.266616821289062,
      "objective/non_score_reward": -1.2133309841156006,
      "objective/rlhf_reward": -3.40272543868576,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 126.5036392211914,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.5625,
      "step": 140,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.999225378036499
    },
    {
      "episode": 2272,
      "epoch": 0.01361277875639597,
      "loss/policy_avg": 0.28501349687576294,
      "lr": 9.909892638036812e-06,
      "objective/entropy": 61.523101806640625,
      "objective/kl": 17.776689529418945,
      "objective/non_score_reward": -0.8888344764709473,
      "objective/rlhf_reward": -1.8220045725504557,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 87.0567398071289,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.537109375,
      "step": 141,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.000370740890503
    },
    {
      "episode": 2288,
      "epoch": 0.013708643395525518,
      "loss/policy_avg": 0.30668091773986816,
      "lr": 9.909253578732107e-06,
      "objective/entropy": 227.46041870117188,
      "objective/kl": 20.17832374572754,
      "objective/non_score_reward": -1.0089161396026611,
      "objective/rlhf_reward": -2.5198930142247047,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 50.498268127441406,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.685546875,
      "step": 142,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.999216079711914
    },
    {
      "episode": 2304,
      "epoch": 0.013804508034655067,
      "loss/policy_avg": 0.3348355293273926,
      "lr": 9.908614519427404e-06,
      "objective/entropy": 164.50863647460938,
      "objective/kl": 13.646249771118164,
      "objective/non_score_reward": -0.6823124885559082,
      "objective/rlhf_reward": -1.1251298821607407,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 63.31299591064453,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.95703125,
      "step": 143,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9986834526062012
    },
    {
      "episode": 2320,
      "epoch": 0.013900372673784616,
      "loss/policy_avg": 0.7517778277397156,
      "lr": 9.9079754601227e-06,
      "objective/entropy": -69.42684936523438,
      "objective/kl": 13.007519721984863,
      "objective/non_score_reward": -0.6503760814666748,
      "objective/rlhf_reward": -0.2015041172504426,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 15.501136779785156,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.533203125,
      "step": 144,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9969980716705322
    },
    {
      "episode": 2336,
      "epoch": 0.013996237312914165,
      "loss/policy_avg": 0.1666509509086609,
      "lr": 9.907336400817996e-06,
      "objective/entropy": 175.3941192626953,
      "objective/kl": 20.383106231689453,
      "objective/non_score_reward": -1.0191553831100464,
      "objective/rlhf_reward": -2.414762055099593,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 102.40309143066406,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.65625,
      "step": 145,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9960074424743652
    },
    {
      "episode": 2352,
      "epoch": 0.014092101952043714,
      "loss/policy_avg": 0.08111919462680817,
      "lr": 9.906697341513293e-06,
      "objective/entropy": 66.45804595947266,
      "objective/kl": 20.63641357421875,
      "objective/non_score_reward": -1.0318206548690796,
      "objective/rlhf_reward": -2.7680326637968253,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 16.144962310791016,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.44921875,
      "step": 146,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0003509521484375
    },
    {
      "episode": 2368,
      "epoch": 0.014187966591173263,
      "loss/policy_avg": 0.2162848860025406,
      "lr": 9.90605828220859e-06,
      "objective/entropy": 66.34003448486328,
      "objective/kl": 21.03724479675293,
      "objective/non_score_reward": -1.051862359046936,
      "objective/rlhf_reward": -1.8074494361877442,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 56.59767150878906,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.544921875,
      "step": 147,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9967288970947266
    },
    {
      "episode": 2384,
      "epoch": 0.014283831230302812,
      "loss/policy_avg": 0.13452857732772827,
      "lr": 9.905419222903886e-06,
      "objective/entropy": 160.91929626464844,
      "objective/kl": 22.133365631103516,
      "objective/non_score_reward": -1.10666823387146,
      "objective/rlhf_reward": -2.693339631954829,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 64.49358367919922,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.62109375,
      "step": 148,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9988274574279785
    },
    {
      "episode": 2400,
      "epoch": 0.01437969586943236,
      "loss/policy_avg": 1.6826289892196655,
      "lr": 9.904780163599183e-06,
      "objective/entropy": -182.28018188476562,
      "objective/kl": 22.543842315673828,
      "objective/non_score_reward": -1.1271920204162598,
      "objective/rlhf_reward": -3.084936280449001,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 70.59880828857422,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.62890625,
      "step": 149,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0008223056793213
    },
    {
      "episode": 2416,
      "epoch": 0.01447556050856191,
      "loss/policy_avg": 0.4059183597564697,
      "lr": 9.904141104294478e-06,
      "objective/entropy": 225.73135375976562,
      "objective/kl": 23.115840911865234,
      "objective/non_score_reward": -1.1557921171188354,
      "objective/rlhf_reward": -2.8898351351420084,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 45.14168930053711,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.703125,
      "step": 150,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9997532367706299
    },
    {
      "episode": 2432,
      "epoch": 0.01457142514769146,
      "loss/policy_avg": 0.10681919753551483,
      "lr": 9.903502044989775e-06,
      "objective/entropy": 213.69598388671875,
      "objective/kl": 26.178190231323242,
      "objective/non_score_reward": -1.3089096546173096,
      "objective/rlhf_reward": -3.894002726584106,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 92.52935791015625,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.69140625,
      "step": 151,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9975218772888184
    },
    {
      "episode": 2448,
      "epoch": 0.01466728978682101,
      "loss/policy_avg": -0.2853464186191559,
      "lr": 9.902862985685072e-06,
      "objective/entropy": 58.680572509765625,
      "objective/kl": 17.81705665588379,
      "objective/non_score_reward": -0.8908528089523315,
      "objective/rlhf_reward": -0.6396921619188514,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 89.08941650390625,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.669921875,
      "step": 152,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0143747329711914
    },
    {
      "episode": 2464,
      "epoch": 0.014763154425950558,
      "loss/policy_avg": 0.07825072109699249,
      "lr": 9.902223926380369e-06,
      "objective/entropy": 198.86288452148438,
      "objective/kl": 28.436542510986328,
      "objective/non_score_reward": -1.4218271970748901,
      "objective/rlhf_reward": -2.7635896548044414,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 44.41461181640625,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.59375,
      "step": 153,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9994783401489258
    },
    {
      "episode": 2480,
      "epoch": 0.014859019065080107,
      "loss/policy_avg": 0.27155977487564087,
      "lr": 9.901584867075666e-06,
      "objective/entropy": 89.04707336425781,
      "objective/kl": 21.113758087158203,
      "objective/non_score_reward": -1.0556879043579102,
      "objective/rlhf_reward": -1.2990326031458106,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 58.70441818237305,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.623046875,
      "step": 154,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9971623420715332
    },
    {
      "episode": 2496,
      "epoch": 0.014954883704209656,
      "loss/policy_avg": 0.3080964982509613,
      "lr": 9.900945807770961e-06,
      "objective/entropy": 35.38983154296875,
      "objective/kl": 21.02568817138672,
      "objective/non_score_reward": -1.0512844324111938,
      "objective/rlhf_reward": -2.7241851715401406,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 52.82551193237305,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.560546875,
      "step": 155,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9960044622421265
    },
    {
      "episode": 2512,
      "epoch": 0.015050748343339205,
      "loss/policy_avg": 4.562356472015381,
      "lr": 9.900306748466258e-06,
      "objective/entropy": 253.11752319335938,
      "objective/kl": 22.01451301574707,
      "objective/non_score_reward": -1.1007256507873535,
      "objective/rlhf_reward": -2.798782501284199,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 74.26364135742188,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.765625,
      "step": 156,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9965643882751465
    },
    {
      "episode": 2528,
      "epoch": 0.015146612982468754,
      "loss/policy_avg": 0.21197248995304108,
      "lr": 9.899667689161555e-06,
      "objective/entropy": 149.58770751953125,
      "objective/kl": 23.317626953125,
      "objective/non_score_reward": -1.1658812761306763,
      "objective/rlhf_reward": -2.2635251045227047,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 51.574981689453125,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4736328125,
      "step": 157,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.995574951171875
    },
    {
      "episode": 2544,
      "epoch": 0.015242477621598303,
      "loss/policy_avg": 0.20880039036273956,
      "lr": 9.899028629856852e-06,
      "objective/entropy": -64.38532257080078,
      "objective/kl": 25.92443084716797,
      "objective/non_score_reward": -1.2962216138839722,
      "objective/rlhf_reward": -3.784886217117309,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 138.45706176757812,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.568359375,
      "step": 158,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9968822002410889
    },
    {
      "episode": 2560,
      "epoch": 0.015338342260727852,
      "loss/policy_avg": 0.21600359678268433,
      "lr": 9.898389570552149e-06,
      "objective/entropy": 3.545970916748047,
      "objective/kl": 23.09051513671875,
      "objective/non_score_reward": -1.1545257568359375,
      "objective/rlhf_reward": -2.6706922007369354,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 36.885650634765625,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.55859375,
      "step": 159,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9993218183517456
    },
    {
      "episode": 2576,
      "epoch": 0.015434206899857401,
      "loss/policy_avg": 0.5031390190124512,
      "lr": 9.897750511247446e-06,
      "objective/entropy": 98.00604248046875,
      "objective/kl": 25.33047866821289,
      "objective/non_score_reward": -1.2665239572525024,
      "objective/rlhf_reward": -3.4619760847726635,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 83.63774871826172,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.671875,
      "step": 160,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.000274896621704
    },
    {
      "episode": 2592,
      "epoch": 0.01553007153898695,
      "loss/policy_avg": 0.018053412437438965,
      "lr": 9.89711145194274e-06,
      "objective/entropy": 2.8434524536132812,
      "objective/kl": 24.395084381103516,
      "objective/non_score_reward": -1.2197542190551758,
      "objective/rlhf_reward": -3.2171576074963673,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 1.6353378295898438,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.64453125,
      "step": 161,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.001478433609009
    },
    {
      "episode": 2608,
      "epoch": 0.0156259361781165,
      "loss/policy_avg": 0.25576311349868774,
      "lr": 9.896472392638038e-06,
      "objective/entropy": -64.24278259277344,
      "objective/kl": 16.287256240844727,
      "objective/non_score_reward": -0.8143627643585205,
      "objective/rlhf_reward": -1.5241178731123606,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 25.824050903320312,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6953125,
      "step": 162,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9984066486358643
    },
    {
      "episode": 2624,
      "epoch": 0.01572180081724605,
      "loss/policy_avg": 0.2750253677368164,
      "lr": 9.895833333333334e-06,
      "objective/entropy": 170.5203857421875,
      "objective/kl": 35.09113693237305,
      "objective/non_score_reward": -1.7545567750930786,
      "objective/rlhf_reward": -4.094508086086485,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 91.88323974609375,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.76171875,
      "step": 163,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9978190660476685
    },
    {
      "episode": 2640,
      "epoch": 0.0158176654563756,
      "loss/policy_avg": 0.2685161828994751,
      "lr": 9.895194274028631e-06,
      "objective/entropy": 107.911376953125,
      "objective/kl": 21.708637237548828,
      "objective/non_score_reward": -1.0854318141937256,
      "objective/rlhf_reward": -2.8911290570214834,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 48.546165466308594,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.603515625,
      "step": 164,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9939230680465698
    },
    {
      "episode": 2656,
      "epoch": 0.015913530095505148,
      "loss/policy_avg": 0.3802343010902405,
      "lr": 9.894555214723928e-06,
      "objective/entropy": 137.427978515625,
      "objective/kl": 20.673809051513672,
      "objective/non_score_reward": -1.0336904525756836,
      "objective/rlhf_reward": -2.793125978022247,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 36.90850830078125,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.63671875,
      "step": 165,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9987661838531494
    },
    {
      "episode": 2672,
      "epoch": 0.016009394734634697,
      "loss/policy_avg": 0.0008638650178909302,
      "lr": 9.893916155419225e-06,
      "objective/entropy": 159.45681762695312,
      "objective/kl": 20.339492797851562,
      "objective/non_score_reward": -1.016974687576294,
      "objective/rlhf_reward": -2.7086488542303275,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 6.459288597106934,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.515625,
      "step": 166,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9977226257324219
    },
    {
      "episode": 2688,
      "epoch": 0.016105259373764245,
      "loss/policy_avg": 0.3463206887245178,
      "lr": 9.89327709611452e-06,
      "objective/entropy": -75.2735824584961,
      "objective/kl": 27.865215301513672,
      "objective/non_score_reward": -1.3932607173919678,
      "objective/rlhf_reward": -4.173042631149292,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 139.90060424804688,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.587890625,
      "step": 167,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0016684532165527
    },
    {
      "episode": 2704,
      "epoch": 0.016201124012893794,
      "loss/policy_avg": 0.07642253488302231,
      "lr": 9.892638036809815e-06,
      "objective/entropy": 38.99913787841797,
      "objective/kl": 19.061498641967773,
      "objective/non_score_reward": -0.9530749320983887,
      "objective/rlhf_reward": -1.987470920356821,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 22.035629272460938,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.484375,
      "step": 168,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0013604164123535
    },
    {
      "episode": 2720,
      "epoch": 0.016296988652023343,
      "loss/policy_avg": 0.2990867495536804,
      "lr": 9.891998977505112e-06,
      "objective/entropy": 199.7046661376953,
      "objective/kl": 23.46067237854004,
      "objective/non_score_reward": -1.1730337142944336,
      "objective/rlhf_reward": -3.268302519519893,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 19.572267532348633,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.6171875,
      "step": 169,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.998270034790039
    },
    {
      "episode": 2736,
      "epoch": 0.016392853291152892,
      "loss/policy_avg": 0.3040146231651306,
      "lr": 9.89135991820041e-06,
      "objective/entropy": 84.5781021118164,
      "objective/kl": 24.218996047973633,
      "objective/non_score_reward": -1.2109497785568237,
      "objective/rlhf_reward": -2.896387885289128,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 91.4429931640625,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.57421875,
      "step": 170,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0007870197296143
    },
    {
      "episode": 2752,
      "epoch": 0.01648871793028244,
      "loss/policy_avg": 0.24132516980171204,
      "lr": 9.890720858895706e-06,
      "objective/entropy": 25.26891326904297,
      "objective/kl": 12.311616897583008,
      "objective/non_score_reward": -0.6155807971954346,
      "objective/rlhf_reward": -2.4623232781887054,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 4.089572906494141,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6015625,
      "step": 171,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9984550476074219
    },
    {
      "episode": 2768,
      "epoch": 0.01658458256941199,
      "loss/policy_avg": 0.07815683633089066,
      "lr": 9.890081799591003e-06,
      "objective/entropy": -2.7739601135253906,
      "objective/kl": 20.480499267578125,
      "objective/non_score_reward": -1.0240248441696167,
      "objective/rlhf_reward": -2.6151468185738325,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 11.766371726989746,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.52734375,
      "step": 172,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.999391794204712
    },
    {
      "episode": 2784,
      "epoch": 0.01668044720854154,
      "loss/policy_avg": 0.31003671884536743,
      "lr": 9.8894427402863e-06,
      "objective/entropy": -5.804538726806641,
      "objective/kl": 23.551572799682617,
      "objective/non_score_reward": -1.1775786876678467,
      "objective/rlhf_reward": -3.2597167297319025,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 241.19540405273438,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.587890625,
      "step": 173,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9990514516830444
    },
    {
      "episode": 2800,
      "epoch": 0.016776311847671088,
      "loss/policy_avg": 0.027285143733024597,
      "lr": 9.888803680981595e-06,
      "objective/entropy": 91.14071655273438,
      "objective/kl": 19.611085891723633,
      "objective/non_score_reward": -0.9805543422698975,
      "objective/rlhf_reward": -2.44126462471044,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 60.10600662231445,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.537109375,
      "step": 174,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9972784519195557
    },
    {
      "episode": 2816,
      "epoch": 0.016872176486800637,
      "loss/policy_avg": 0.2845172882080078,
      "lr": 9.888164621676892e-06,
      "objective/entropy": 30.190153121948242,
      "objective/kl": 24.783939361572266,
      "objective/non_score_reward": -1.239197015762329,
      "objective/rlhf_reward": -3.578185775367123,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 76.30748748779297,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.443359375,
      "step": 175,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9994440078735352
    },
    {
      "episode": 2832,
      "epoch": 0.016968041125930186,
      "loss/policy_avg": 0.5662503838539124,
      "lr": 9.887525562372189e-06,
      "objective/entropy": 60.807342529296875,
      "objective/kl": 12.370782852172852,
      "objective/non_score_reward": -0.6185390949249268,
      "objective/rlhf_reward": -1.0503242506581225,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 14.155126571655273,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.525390625,
      "step": 176,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9987331628799438
    },
    {
      "episode": 2848,
      "epoch": 0.017063905765059735,
      "loss/policy_avg": 0.08586982637643814,
      "lr": 9.886886503067486e-06,
      "objective/entropy": 43.38105010986328,
      "objective/kl": 24.246856689453125,
      "objective/non_score_reward": -1.2123429775238037,
      "objective/rlhf_reward": -3.470769503203732,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 141.50592041015625,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.64453125,
      "step": 177,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9969239234924316
    },
    {
      "episode": 2864,
      "epoch": 0.017159770404189284,
      "loss/policy_avg": 0.26094895601272583,
      "lr": 9.886247443762783e-06,
      "objective/entropy": 54.85191345214844,
      "objective/kl": 20.912307739257812,
      "objective/non_score_reward": -1.0456154346466064,
      "objective/rlhf_reward": -2.7824616193771363,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 19.43996810913086,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4755859375,
      "step": 178,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0007808208465576
    },
    {
      "episode": 2880,
      "epoch": 0.017255635043318833,
      "loss/policy_avg": -0.0008885636925697327,
      "lr": 9.88560838445808e-06,
      "objective/entropy": 1.5364952087402344,
      "objective/kl": 18.547964096069336,
      "objective/non_score_reward": -0.9273982048034668,
      "objective/rlhf_reward": -1.762181530671056,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 103.84625244140625,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.52734375,
      "step": 179,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0031652450561523
    },
    {
      "episode": 2896,
      "epoch": 0.017351499682448382,
      "loss/policy_avg": 0.07095308601856232,
      "lr": 9.884969325153375e-06,
      "objective/entropy": -57.707908630371094,
      "objective/kl": 17.486156463623047,
      "objective/non_score_reward": -0.8743079304695129,
      "objective/rlhf_reward": -1.3745254895844794,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 35.78956604003906,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.63671875,
      "step": 180,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9995075464248657
    },
    {
      "episode": 2912,
      "epoch": 0.01744736432157793,
      "loss/policy_avg": 0.42247164249420166,
      "lr": 9.884330265848671e-06,
      "objective/entropy": 194.7113037109375,
      "objective/kl": 21.53358268737793,
      "objective/non_score_reward": -1.0766791105270386,
      "objective/rlhf_reward": -2.750457256045893,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 58.89783477783203,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.720703125,
      "step": 181,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.996431827545166
    },
    {
      "episode": 2928,
      "epoch": 0.01754322896070748,
      "loss/policy_avg": 0.3189627528190613,
      "lr": 9.883691206543968e-06,
      "objective/entropy": 125.43355560302734,
      "objective/kl": 20.729223251342773,
      "objective/non_score_reward": -1.0364612340927124,
      "objective/rlhf_reward": -2.767242708293301,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 31.974578857421875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.60546875,
      "step": 182,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9984747171401978
    },
    {
      "episode": 2944,
      "epoch": 0.01763909359983703,
      "loss/policy_avg": 0.19416040182113647,
      "lr": 9.883052147239265e-06,
      "objective/entropy": 127.4957275390625,
      "objective/kl": 23.107641220092773,
      "objective/non_score_reward": -1.1553820371627808,
      "objective/rlhf_reward": -3.2429258609689295,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 41.45734786987305,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.376953125,
      "step": 183,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.999535322189331
    },
    {
      "episode": 2960,
      "epoch": 0.017734958238966578,
      "loss/policy_avg": 0.04916887357831001,
      "lr": 9.882413087934562e-06,
      "objective/entropy": -16.33904266357422,
      "objective/kl": 15.624849319458008,
      "objective/non_score_reward": -0.7812424898147583,
      "objective/rlhf_reward": -1.002263667360816,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 86.75860595703125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.8203125,
      "step": 184,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9967164993286133
    },
    {
      "episode": 2976,
      "epoch": 0.017830822878096127,
      "loss/policy_avg": 0.15854808688163757,
      "lr": 9.881774028629857e-06,
      "objective/entropy": -9.968147277832031,
      "objective/kl": 20.46514320373535,
      "objective/non_score_reward": -1.0232571363449097,
      "objective/rlhf_reward": -2.35969527165095,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 16.395225524902344,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.5859375,
      "step": 185,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9976359605789185
    },
    {
      "episode": 2992,
      "epoch": 0.017926687517225676,
      "loss/policy_avg": 0.36498603224754333,
      "lr": 9.881134969325154e-06,
      "objective/entropy": 209.59991455078125,
      "objective/kl": 18.690290451049805,
      "objective/non_score_reward": -0.9345145225524902,
      "objective/rlhf_reward": -2.338058030605316,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 12.64120101928711,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.623046875,
      "step": 186,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9994118213653564
    },
    {
      "episode": 3008,
      "epoch": 0.018022552156355228,
      "loss/policy_avg": 0.15073028206825256,
      "lr": 9.880495910020451e-06,
      "objective/entropy": 33.50044250488281,
      "objective/kl": 21.099205017089844,
      "objective/non_score_reward": -1.0549602508544922,
      "objective/rlhf_reward": 0.1801587581634525,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 28.017484664916992,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.666015625,
      "step": 187,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000263214111328
    },
    {
      "episode": 3024,
      "epoch": 0.018118416795484777,
      "loss/policy_avg": 0.04914219304919243,
      "lr": 9.879856850715748e-06,
      "objective/entropy": 109.99685668945312,
      "objective/kl": 23.795440673828125,
      "objective/non_score_reward": -1.1897720098495483,
      "objective/rlhf_reward": -0.3590880990028378,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 17.797225952148438,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.529296875,
      "step": 188,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0002386569976807
    },
    {
      "episode": 3040,
      "epoch": 0.018214281434614326,
      "loss/policy_avg": 0.26782599091529846,
      "lr": 9.879217791411043e-06,
      "objective/entropy": 46.40031051635742,
      "objective/kl": 15.295504570007324,
      "objective/non_score_reward": -0.764775276184082,
      "objective/rlhf_reward": -1.6998512086614799,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 19.033124923706055,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4287109375,
      "step": 189,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0006394386291504
    },
    {
      "episode": 3056,
      "epoch": 0.018310146073743875,
      "loss/policy_avg": -0.0003484562039375305,
      "lr": 9.87857873210634e-06,
      "objective/entropy": -128.13638305664062,
      "objective/kl": 23.236797332763672,
      "objective/non_score_reward": -1.1618399620056152,
      "objective/rlhf_reward": -2.985500340879546,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 122.61852264404297,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.521484375,
      "step": 190,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.998673439025879
    },
    {
      "episode": 3072,
      "epoch": 0.018406010712873424,
      "loss/policy_avg": 0.285878986120224,
      "lr": 9.877939672801637e-06,
      "objective/entropy": -155.79151916503906,
      "objective/kl": 17.15728187561035,
      "objective/non_score_reward": -0.8578640818595886,
      "objective/rlhf_reward": -1.6981231282154718,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 27.024686813354492,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.578125,
      "step": 191,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9977209568023682
    },
    {
      "episode": 3088,
      "epoch": 0.018501875352002973,
      "loss/policy_avg": 0.03845605254173279,
      "lr": 9.877300613496934e-06,
      "objective/entropy": -79.23377227783203,
      "objective/kl": 24.854154586791992,
      "objective/non_score_reward": -1.2427077293395996,
      "objective/rlhf_reward": -3.4145718505054266,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 108.08650970458984,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.54296875,
      "step": 192,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9965288639068604
    },
    {
      "episode": 3104,
      "epoch": 0.018597739991132522,
      "loss/policy_avg": 0.22054271399974823,
      "lr": 9.876661554192229e-06,
      "objective/entropy": 58.46562576293945,
      "objective/kl": 18.69571876525879,
      "objective/non_score_reward": -0.9347859621047974,
      "objective/rlhf_reward": -1.3391437292099,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 17.535587310791016,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.66796875,
      "step": 193,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.996351957321167
    },
    {
      "episode": 3120,
      "epoch": 0.01869360463026207,
      "loss/policy_avg": 0.46004775166511536,
      "lr": 9.876022494887526e-06,
      "objective/entropy": 208.6689453125,
      "objective/kl": 24.537294387817383,
      "objective/non_score_reward": -1.2268648147583008,
      "objective/rlhf_reward": -3.3511998941570074,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 103.11289978027344,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.6171875,
      "step": 194,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9980366230010986
    },
    {
      "episode": 3136,
      "epoch": 0.01878946926939162,
      "loss/policy_avg": 0.14284425973892212,
      "lr": 9.875383435582823e-06,
      "objective/entropy": -140.25045776367188,
      "objective/kl": 21.156387329101562,
      "objective/non_score_reward": -1.0578192472457886,
      "objective/rlhf_reward": -1.8312772423028945,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 95.11038208007812,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.69921875,
      "step": 195,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0021564960479736
    },
    {
      "episode": 3152,
      "epoch": 0.01888533390852117,
      "loss/policy_avg": 0.4036502540111542,
      "lr": 9.87474437627812e-06,
      "objective/entropy": 97.97139739990234,
      "objective/kl": 20.765098571777344,
      "objective/non_score_reward": -1.038254737854004,
      "objective/rlhf_reward": -1.7530193686485291,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 33.61680603027344,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.75,
      "step": 196,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9960455894470215
    },
    {
      "episode": 3168,
      "epoch": 0.018981198547650718,
      "loss/policy_avg": 0.03367016091942787,
      "lr": 9.874105316973416e-06,
      "objective/entropy": 110.7692642211914,
      "objective/kl": 32.466636657714844,
      "objective/non_score_reward": -1.6233320236206055,
      "objective/rlhf_reward": -4.668499465259623,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 22.905399322509766,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.66015625,
      "step": 197,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000126361846924
    },
    {
      "episode": 3184,
      "epoch": 0.019077063186780267,
      "loss/policy_avg": 0.3382406532764435,
      "lr": 9.873466257668712e-06,
      "objective/entropy": -46.87655258178711,
      "objective/kl": 23.83783531188965,
      "objective/non_score_reward": -1.1918917894363403,
      "objective/rlhf_reward": -3.44205424550168,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 26.46108055114746,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4814453125,
      "step": 198,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9974579811096191
    },
    {
      "episode": 3200,
      "epoch": 0.019172927825909816,
      "loss/policy_avg": 0.05052588880062103,
      "lr": 9.872827198364009e-06,
      "objective/entropy": -62.79549789428711,
      "objective/kl": 19.587276458740234,
      "objective/non_score_reward": -0.9793638586997986,
      "objective/rlhf_reward": -0.9937364205133643,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 37.62165069580078,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.564453125,
      "step": 199,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9972220659255981
    },
    {
      "episode": 3216,
      "epoch": 0.019268792465039365,
      "loss/policy_avg": 0.2230260968208313,
      "lr": 9.872188139059305e-06,
      "objective/entropy": -37.75834655761719,
      "objective/kl": 23.102069854736328,
      "objective/non_score_reward": -1.1551035642623901,
      "objective/rlhf_reward": -3.2787786035830075,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 56.49012756347656,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.583984375,
      "step": 200,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000248908996582
    },
    {
      "episode": 3232,
      "epoch": 0.019364657104168913,
      "loss/policy_avg": 0.4118785858154297,
      "lr": 9.871549079754602e-06,
      "objective/entropy": 85.49769592285156,
      "objective/kl": 25.69809913635254,
      "objective/non_score_reward": -1.284904956817627,
      "objective/rlhf_reward": -3.5833605816036016,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 56.752174377441406,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.66015625,
      "step": 201,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9987632036209106
    },
    {
      "episode": 3248,
      "epoch": 0.019460521743298462,
      "loss/policy_avg": 0.06031988561153412,
      "lr": 9.8709100204499e-06,
      "objective/entropy": 16.456554412841797,
      "objective/kl": 25.35955047607422,
      "objective/non_score_reward": -1.2679774761199951,
      "objective/rlhf_reward": -3.6213118239358515,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 21.745624542236328,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.529296875,
      "step": 202,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9980382919311523
    },
    {
      "episode": 3264,
      "epoch": 0.01955638638242801,
      "loss/policy_avg": 0.06312263011932373,
      "lr": 9.870270961145196e-06,
      "objective/entropy": 132.99948120117188,
      "objective/kl": 22.432659149169922,
      "objective/non_score_reward": -1.1216330528259277,
      "objective/rlhf_reward": -2.8246725253468616,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 93.43849182128906,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.568359375,
      "step": 203,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9995737075805664
    },
    {
      "episode": 3280,
      "epoch": 0.01965225102155756,
      "loss/policy_avg": 0.6064414978027344,
      "lr": 9.869631901840491e-06,
      "objective/entropy": -19.207683563232422,
      "objective/kl": 18.83993148803711,
      "objective/non_score_reward": -0.9419965744018555,
      "objective/rlhf_reward": -2.3173880978540033,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 90.60572052001953,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.4931640625,
      "step": 204,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.000382661819458
    },
    {
      "episode": 3296,
      "epoch": 0.01974811566068711,
      "loss/policy_avg": 0.2940763831138611,
      "lr": 9.868992842535788e-06,
      "objective/entropy": 83.77371978759766,
      "objective/kl": 25.884700775146484,
      "objective/non_score_reward": -1.2942349910736084,
      "objective/rlhf_reward": -3.3521112903681507,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 39.873409271240234,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.447265625,
      "step": 205,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9972522258758545
    },
    {
      "episode": 3312,
      "epoch": 0.019843980299816658,
      "loss/policy_avg": 0.18257562816143036,
      "lr": 9.868353783231085e-06,
      "objective/entropy": 119.6646728515625,
      "objective/kl": 27.568458557128906,
      "objective/non_score_reward": -1.3784228563308716,
      "objective/rlhf_reward": -1.1136915445327755,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 48.24208068847656,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.66796875,
      "step": 206,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9987564086914062
    },
    {
      "episode": 3328,
      "epoch": 0.019939844938946207,
      "loss/policy_avg": -0.011964879930019379,
      "lr": 9.867714723926382e-06,
      "objective/entropy": 79.78416442871094,
      "objective/kl": 24.409799575805664,
      "objective/non_score_reward": -1.2204899787902832,
      "objective/rlhf_reward": -3.5033578658975184,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 19.269145965576172,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4658203125,
      "step": 207,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.000608205795288
    },
    {
      "episode": 3344,
      "epoch": 0.020035709578075756,
      "loss/policy_avg": 0.04908262565732002,
      "lr": 9.867075664621679e-06,
      "objective/entropy": 174.413818359375,
      "objective/kl": 24.83539581298828,
      "objective/non_score_reward": -1.241769790649414,
      "objective/rlhf_reward": -3.3629594779649548,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 14.995980262756348,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.54296875,
      "step": 208,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9985637664794922
    },
    {
      "episode": 3360,
      "epoch": 0.020131574217205305,
      "loss/policy_avg": 0.14710021018981934,
      "lr": 9.866436605316974e-06,
      "objective/entropy": 132.51194763183594,
      "objective/kl": 29.743432998657227,
      "objective/non_score_reward": -1.4871716499328613,
      "objective/rlhf_reward": -4.344566795889454,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 65.08041381835938,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.490234375,
      "step": 209,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0021185874938965
    },
    {
      "episode": 3376,
      "epoch": 0.020227438856334854,
      "loss/policy_avg": 0.0796532854437828,
      "lr": 9.86579754601227e-06,
      "objective/entropy": 1.3461151123046875,
      "objective/kl": 26.279298782348633,
      "objective/non_score_reward": -1.313965082168579,
      "objective/rlhf_reward": -0.8558599710464474,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 105.49284362792969,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5859375,
      "step": 210,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9989545345306396
    },
    {
      "episode": 3392,
      "epoch": 0.020323303495464403,
      "loss/policy_avg": -0.03664415329694748,
      "lr": 9.865158486707568e-06,
      "objective/entropy": -37.266082763671875,
      "objective/kl": 19.48423957824707,
      "objective/non_score_reward": -0.9742119908332825,
      "objective/rlhf_reward": -0.9731288298380103,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 8.304027557373047,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.638671875,
      "step": 211,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.003216028213501
    },
    {
      "episode": 3408,
      "epoch": 0.020419168134593952,
      "loss/policy_avg": 0.30985838174819946,
      "lr": 9.864519427402863e-06,
      "objective/entropy": 94.80859375,
      "objective/kl": 29.94342041015625,
      "objective/non_score_reward": -1.4971709251403809,
      "objective/rlhf_reward": -4.564851482112971,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 115.7642593383789,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.74609375,
      "step": 212,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9959279298782349
    },
    {
      "episode": 3424,
      "epoch": 0.0205150327737235,
      "loss/policy_avg": 0.23234406113624573,
      "lr": 9.86388036809816e-06,
      "objective/entropy": 125.32878875732422,
      "objective/kl": 33.22450637817383,
      "objective/non_score_reward": -1.6612253189086914,
      "objective/rlhf_reward": -4.820072407993387,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 82.43852233886719,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.58203125,
      "step": 213,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.001713752746582
    },
    {
      "episode": 3440,
      "epoch": 0.02061089741285305,
      "loss/policy_avg": 1.5097947120666504,
      "lr": 9.863241308793457e-06,
      "objective/entropy": 132.66845703125,
      "objective/kl": 27.622318267822266,
      "objective/non_score_reward": -1.3811159133911133,
      "objective/rlhf_reward": -3.6996345475044956,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 26.179336547851562,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.708984375,
      "step": 214,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9993813037872314
    },
    {
      "episode": 3456,
      "epoch": 0.0207067620519826,
      "loss/policy_avg": 0.12209601700305939,
      "lr": 9.862602249488753e-06,
      "objective/entropy": 132.88406372070312,
      "objective/kl": 26.24971580505371,
      "objective/non_score_reward": -1.312485694885254,
      "objective/rlhf_reward": -5.249942898750305,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 41.524139404296875,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.7109375,
      "step": 215,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9990661144256592
    },
    {
      "episode": 3472,
      "epoch": 0.02080262669111215,
      "loss/policy_avg": 0.3654727339744568,
      "lr": 9.86196319018405e-06,
      "objective/entropy": 39.344974517822266,
      "objective/kl": 23.619754791259766,
      "objective/non_score_reward": -1.18098783493042,
      "objective/rlhf_reward": -1.8002320870172706,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 5.19040584564209,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4951171875,
      "step": 216,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9990694522857666
    },
    {
      "episode": 3488,
      "epoch": 0.0208984913302417,
      "loss/policy_avg": 0.05907230079174042,
      "lr": 9.861324130879346e-06,
      "objective/entropy": -49.055564880371094,
      "objective/kl": 27.70423126220703,
      "objective/non_score_reward": -1.3852115869522095,
      "objective/rlhf_reward": -3.8789869598752125,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 62.16511917114258,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.609375,
      "step": 217,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9973804950714111
    },
    {
      "episode": 3504,
      "epoch": 0.02099435596937125,
      "loss/policy_avg": 0.5758800506591797,
      "lr": 9.860685071574642e-06,
      "objective/entropy": 18.1787166595459,
      "objective/kl": 25.688358306884766,
      "objective/non_score_reward": -1.2844178676605225,
      "objective/rlhf_reward": -2.2139523147952285,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 23.39984130859375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.498046875,
      "step": 218,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9974064826965332
    },
    {
      "episode": 3520,
      "epoch": 0.021090220608500798,
      "loss/policy_avg": 0.2610527575016022,
      "lr": 9.86004601226994e-06,
      "objective/entropy": -68.09791564941406,
      "objective/kl": 26.7615966796875,
      "objective/non_score_reward": -1.3380796909332275,
      "objective/rlhf_reward": -4.026806149512453,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 124.13450622558594,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4599609375,
      "step": 219,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9986741542816162
    },
    {
      "episode": 3536,
      "epoch": 0.021186085247630347,
      "loss/policy_avg": 0.1624567210674286,
      "lr": 9.859406952965236e-06,
      "objective/entropy": -113.99856567382812,
      "objective/kl": 19.689868927001953,
      "objective/non_score_reward": -0.9844935536384583,
      "objective/rlhf_reward": -2.113145466121744,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 45.295875549316406,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.70703125,
      "step": 220,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0004196166992188
    },
    {
      "episode": 3552,
      "epoch": 0.021281949886759896,
      "loss/policy_avg": 0.13548433780670166,
      "lr": 9.858767893660533e-06,
      "objective/entropy": 154.66708374023438,
      "objective/kl": 31.08365249633789,
      "objective/non_score_reward": -1.554182529449463,
      "objective/rlhf_reward": -4.554870968282805,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 43.560997009277344,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7265625,
      "step": 221,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9972419738769531
    },
    {
      "episode": 3568,
      "epoch": 0.021377814525889445,
      "loss/policy_avg": 0.04025420919060707,
      "lr": 9.858128834355828e-06,
      "objective/entropy": 145.02468872070312,
      "objective/kl": 31.459678649902344,
      "objective/non_score_reward": -1.572983980178833,
      "objective/rlhf_reward": -4.932686292861385,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 41.05935287475586,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4560546875,
      "step": 222,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.0009706020355225
    },
    {
      "episode": 3584,
      "epoch": 0.021473679165018994,
      "loss/policy_avg": 1.5885295867919922,
      "lr": 9.857489775051125e-06,
      "objective/entropy": 141.5781707763672,
      "objective/kl": 34.53314971923828,
      "objective/non_score_reward": -1.726657509803772,
      "objective/rlhf_reward": -5.244770532072174,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 37.03607177734375,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.658203125,
      "step": 223,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.992570400238037
    },
    {
      "episode": 3600,
      "epoch": 0.021569543804148543,
      "loss/policy_avg": 0.9811650514602661,
      "lr": 9.856850715746422e-06,
      "objective/entropy": -30.946441650390625,
      "objective/kl": 29.145998001098633,
      "objective/non_score_reward": -1.4572999477386475,
      "objective/rlhf_reward": -4.450597622481686,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 19.481060028076172,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.462890625,
      "step": 224,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9983983039855957
    },
    {
      "episode": 3616,
      "epoch": 0.021665408443278092,
      "loss/policy_avg": 0.5196128487586975,
      "lr": 9.856211656441719e-06,
      "objective/entropy": -16.55962371826172,
      "objective/kl": 28.4706974029541,
      "objective/non_score_reward": -1.423534870147705,
      "objective/rlhf_reward": -3.5714332482972484,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 117.12289428710938,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.732421875,
      "step": 225,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9975980520248413
    },
    {
      "episode": 3632,
      "epoch": 0.02176127308240764,
      "loss/policy_avg": 0.6528609395027161,
      "lr": 9.855572597137016e-06,
      "objective/entropy": 136.64077758789062,
      "objective/kl": 32.46646499633789,
      "objective/non_score_reward": -1.6233232021331787,
      "objective/rlhf_reward": -2.093292927742004,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 44.35145950317383,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.669921875,
      "step": 226,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9994146823883057
    },
    {
      "episode": 3648,
      "epoch": 0.02185713772153719,
      "loss/policy_avg": 0.9434906244277954,
      "lr": 9.854933537832313e-06,
      "objective/entropy": -36.75615310668945,
      "objective/kl": 31.890575408935547,
      "objective/non_score_reward": -1.5945286750793457,
      "objective/rlhf_reward": -5.052601966887636,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 65.19577026367188,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.59375,
      "step": 227,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9979374408721924
    },
    {
      "episode": 3664,
      "epoch": 0.02195300236066674,
      "loss/policy_avg": 0.36130765080451965,
      "lr": 9.854294478527608e-06,
      "objective/entropy": 47.61101531982422,
      "objective/kl": 18.669593811035156,
      "objective/non_score_reward": -0.9334796071052551,
      "objective/rlhf_reward": -2.3339184284210206,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 15.266149520874023,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.578125,
      "step": 228,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9959979057312012
    },
    {
      "episode": 3680,
      "epoch": 0.022048866999796288,
      "loss/policy_avg": 0.18321090936660767,
      "lr": 9.853655419222905e-06,
      "objective/entropy": 116.60293579101562,
      "objective/kl": 27.56112289428711,
      "objective/non_score_reward": -1.378056287765503,
      "objective/rlhf_reward": -3.5648136837052657,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 29.471284866333008,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.537109375,
      "step": 229,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9991540908813477
    },
    {
      "episode": 3696,
      "epoch": 0.022144731638925837,
      "loss/policy_avg": -0.044996485114097595,
      "lr": 9.853016359918202e-06,
      "objective/entropy": 38.275238037109375,
      "objective/kl": 28.720836639404297,
      "objective/non_score_reward": -1.4360418319702148,
      "objective/rlhf_reward": -4.187907754388407,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 173.6102752685547,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.552734375,
      "step": 230,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.997882604598999
    },
    {
      "episode": 3712,
      "epoch": 0.022240596278055386,
      "loss/policy_avg": 0.027855467051267624,
      "lr": 9.852377300613498e-06,
      "objective/entropy": 123.59611511230469,
      "objective/kl": 30.175601959228516,
      "objective/non_score_reward": -1.5087801218032837,
      "objective/rlhf_reward": -4.478861062732294,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 50.733642578125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.37109375,
      "step": 231,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.0003466606140137
    },
    {
      "episode": 3728,
      "epoch": 0.022336460917184935,
      "loss/policy_avg": -0.3093503713607788,
      "lr": 9.851738241308795e-06,
      "objective/entropy": 0.438995361328125,
      "objective/kl": 27.025171279907227,
      "objective/non_score_reward": -1.3512585163116455,
      "objective/rlhf_reward": -5.405034303665161,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 13.092641830444336,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.615234375,
      "step": 232,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000675678253174
    },
    {
      "episode": 3744,
      "epoch": 0.022432325556314484,
      "loss/policy_avg": -0.05236402899026871,
      "lr": 9.85109918200409e-06,
      "objective/entropy": 112.74819946289062,
      "objective/kl": 24.94538688659668,
      "objective/non_score_reward": -1.2472693920135498,
      "objective/rlhf_reward": -3.473305845054325,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 19.200075149536133,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.3583984375,
      "step": 233,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.002117395401001
    },
    {
      "episode": 3760,
      "epoch": 0.022528190195444033,
      "loss/policy_avg": 0.21103611588478088,
      "lr": 9.850460122699387e-06,
      "objective/entropy": 73.77043151855469,
      "objective/kl": 28.00216293334961,
      "objective/non_score_reward": -1.4001080989837646,
      "objective/rlhf_reward": -3.6530211669968917,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 13.291183471679688,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5009765625,
      "step": 234,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9995331764221191
    },
    {
      "episode": 3776,
      "epoch": 0.02262405483457358,
      "loss/policy_avg": 0.6418443918228149,
      "lr": 9.849821063394683e-06,
      "objective/entropy": 19.92426300048828,
      "objective/kl": 31.282997131347656,
      "objective/non_score_reward": -1.5641499757766724,
      "objective/rlhf_reward": -4.931086901456041,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 98.59768676757812,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.34375,
      "step": 235,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0005524158477783
    },
    {
      "episode": 3792,
      "epoch": 0.02271991947370313,
      "loss/policy_avg": 0.20836295187473297,
      "lr": 9.84918200408998e-06,
      "objective/entropy": 28.238201141357422,
      "objective/kl": 29.105060577392578,
      "objective/non_score_reward": -1.455253005027771,
      "objective/rlhf_reward": -4.264752714839533,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 34.374176025390625,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.51171875,
      "step": 236,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9989919662475586
    },
    {
      "episode": 3808,
      "epoch": 0.02281578411283268,
      "loss/policy_avg": 0.43571943044662476,
      "lr": 9.848542944785276e-06,
      "objective/entropy": 144.94302368164062,
      "objective/kl": 33.369178771972656,
      "objective/non_score_reward": -1.6684589385986328,
      "objective/rlhf_reward": -5.314585768912716,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 113.68771362304688,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.607421875,
      "step": 237,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.996192216873169
    },
    {
      "episode": 3824,
      "epoch": 0.02291164875196223,
      "loss/policy_avg": 0.14893671870231628,
      "lr": 9.847903885480573e-06,
      "objective/entropy": 186.38681030273438,
      "objective/kl": 41.077842712402344,
      "objective/non_score_reward": -2.0538923740386963,
      "objective/rlhf_reward": -6.611449215475636,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 168.3666229248047,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.755859375,
      "step": 238,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9984937906265259
    },
    {
      "episode": 3840,
      "epoch": 0.023007513391091777,
      "loss/policy_avg": 0.07648584991693497,
      "lr": 9.84726482617587e-06,
      "objective/entropy": -37.23631286621094,
      "objective/kl": 25.318248748779297,
      "objective/non_score_reward": -1.2659125328063965,
      "objective/rlhf_reward": -3.5073907067447454,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 50.266414642333984,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.48828125,
      "step": 239,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9979995489120483
    },
    {
      "episode": 3856,
      "epoch": 0.023103378030221326,
      "loss/policy_avg": -0.15926438570022583,
      "lr": 9.846625766871167e-06,
      "objective/entropy": 37.868736267089844,
      "objective/kl": 27.493305206298828,
      "objective/non_score_reward": -1.3746652603149414,
      "objective/rlhf_reward": -4.173148546248598,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 6.63505220413208,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.5390625,
      "step": 240,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0081896781921387
    },
    {
      "episode": 3872,
      "epoch": 0.023199242669350875,
      "loss/policy_avg": 0.14562831819057465,
      "lr": 9.845986707566462e-06,
      "objective/entropy": 15.188220977783203,
      "objective/kl": 28.046958923339844,
      "objective/non_score_reward": -1.4023480415344238,
      "objective/rlhf_reward": -4.1587937875703425,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 43.238990783691406,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.578125,
      "step": 241,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.996706247329712
    },
    {
      "episode": 3888,
      "epoch": 0.023295107308480424,
      "loss/policy_avg": 0.11054911464452744,
      "lr": 9.845347648261759e-06,
      "objective/entropy": 65.03858947753906,
      "objective/kl": 30.087387084960938,
      "objective/non_score_reward": -1.5043694972991943,
      "objective/rlhf_reward": -4.070066402630742,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 3.83949613571167,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.587890625,
      "step": 242,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9988956451416016
    },
    {
      "episode": 3904,
      "epoch": 0.023390971947609973,
      "loss/policy_avg": 0.3941475749015808,
      "lr": 9.844708588957056e-06,
      "objective/entropy": 59.93316650390625,
      "objective/kl": 25.623512268066406,
      "objective/non_score_reward": -1.2811756134033203,
      "objective/rlhf_reward": -3.52058264977129,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 78.30380249023438,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.5859375,
      "step": 243,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9990283250808716
    },
    {
      "episode": 3920,
      "epoch": 0.023486836586739522,
      "loss/policy_avg": 0.19095474481582642,
      "lr": 9.844069529652353e-06,
      "objective/entropy": 31.422988891601562,
      "objective/kl": 24.865825653076172,
      "objective/non_score_reward": -1.2432913780212402,
      "objective/rlhf_reward": -3.2398319403330484,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 38.12981033325195,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.53125,
      "step": 244,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.004727840423584
    },
    {
      "episode": 3936,
      "epoch": 0.023582701225869074,
      "loss/policy_avg": 0.049357250332832336,
      "lr": 9.84343047034765e-06,
      "objective/entropy": 21.297576904296875,
      "objective/kl": 35.60150146484375,
      "objective/non_score_reward": -1.7800750732421875,
      "objective/rlhf_reward": -5.720300531387329,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 38.869449615478516,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4716796875,
      "step": 245,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0019991397857666
    },
    {
      "episode": 3952,
      "epoch": 0.023678565864998623,
      "loss/policy_avg": 0.7713517546653748,
      "lr": 9.842791411042945e-06,
      "objective/entropy": 53.62720489501953,
      "objective/kl": 31.218942642211914,
      "objective/non_score_reward": -1.5609471797943115,
      "objective/rlhf_reward": -4.296377490239079,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 48.73869323730469,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7265625,
      "step": 246,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9975836277008057
    },
    {
      "episode": 3968,
      "epoch": 0.023774430504128172,
      "loss/policy_avg": 0.008143262937664986,
      "lr": 9.842152351738242e-06,
      "objective/entropy": 171.02789306640625,
      "objective/kl": 34.79176330566406,
      "objective/non_score_reward": -1.7395880222320557,
      "objective/rlhf_reward": -5.296492939413176,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 21.7828369140625,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.57421875,
      "step": 247,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9989376068115234
    },
    {
      "episode": 3984,
      "epoch": 0.02387029514325772,
      "loss/policy_avg": -0.12264247238636017,
      "lr": 9.841513292433539e-06,
      "objective/entropy": 80.24577331542969,
      "objective/kl": 33.11949920654297,
      "objective/non_score_reward": -1.6559748649597168,
      "objective/rlhf_reward": -4.799071069034646,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 61.87395477294922,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.4599609375,
      "step": 248,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.003005027770996
    },
    {
      "episode": 4000,
      "epoch": 0.02396615978238727,
      "loss/policy_avg": 0.2658330202102661,
      "lr": 9.840874233128836e-06,
      "objective/entropy": 149.58941650390625,
      "objective/kl": 29.3863525390625,
      "objective/non_score_reward": -1.4693175554275513,
      "objective/rlhf_reward": -4.273150358263569,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 58.66055679321289,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.51171875,
      "step": 249,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9972658157348633
    },
    {
      "episode": 4016,
      "epoch": 0.02406202442151682,
      "loss/policy_avg": 0.09115779399871826,
      "lr": 9.840235173824132e-06,
      "objective/entropy": 147.28927612304688,
      "objective/kl": 31.492679595947266,
      "objective/non_score_reward": -1.5746338367462158,
      "objective/rlhf_reward": -4.939285838340206,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 28.799278259277344,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.796875,
      "step": 250,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.002530574798584
    },
    {
      "episode": 4032,
      "epoch": 0.024157889060646368,
      "loss/policy_avg": 0.09398385882377625,
      "lr": 9.83959611451943e-06,
      "objective/entropy": -45.248435974121094,
      "objective/kl": 28.402175903320312,
      "objective/non_score_reward": -1.4201087951660156,
      "objective/rlhf_reward": -4.018575882137405,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 19.838550567626953,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.517578125,
      "step": 251,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9973764419555664
    },
    {
      "episode": 4048,
      "epoch": 0.024253753699775917,
      "loss/policy_avg": 0.19270983338356018,
      "lr": 9.838957055214724e-06,
      "objective/entropy": 77.1705093383789,
      "objective/kl": 34.050987243652344,
      "objective/non_score_reward": -1.7025493383407593,
      "objective/rlhf_reward": -5.076863960425058,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 18.725093841552734,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4814453125,
      "step": 252,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.001063823699951
    },
    {
      "episode": 4064,
      "epoch": 0.024349618338905466,
      "loss/policy_avg": 0.4652649164199829,
      "lr": 9.838317995910021e-06,
      "objective/entropy": 257.7345886230469,
      "objective/kl": 24.133747100830078,
      "objective/non_score_reward": -1.2066874504089355,
      "objective/rlhf_reward": -3.4481475735581935,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 41.46368408203125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.80078125,
      "step": 253,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9955878257751465
    },
    {
      "episode": 4080,
      "epoch": 0.024445482978035015,
      "loss/policy_avg": 0.14692571759223938,
      "lr": 9.837678936605318e-06,
      "objective/entropy": 43.00188064575195,
      "objective/kl": 24.73518180847168,
      "objective/non_score_reward": -1.236759066581726,
      "objective/rlhf_reward": -3.568433978644711,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 75.05264282226562,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7421875,
      "step": 254,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9990639686584473
    },
    {
      "episode": 4096,
      "epoch": 0.024541347617164564,
      "loss/policy_avg": 0.08271847665309906,
      "lr": 9.837039877300615e-06,
      "objective/entropy": -79.57066345214844,
      "objective/kl": 26.90784454345703,
      "objective/non_score_reward": -1.3453922271728516,
      "objective/rlhf_reward": -3.648235575358073,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 24.23294448852539,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.619140625,
      "step": 255,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9984873533248901
    },
    {
      "episode": 4112,
      "epoch": 0.024637212256294113,
      "loss/policy_avg": 0.12403183430433273,
      "lr": 9.83640081799591e-06,
      "objective/entropy": 87.87326049804688,
      "objective/kl": 29.708419799804688,
      "objective/non_score_reward": -1.4854209423065186,
      "objective/rlhf_reward": -4.116855438026499,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 32.65428161621094,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.62890625,
      "step": 256,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9981478452682495
    },
    {
      "episode": 4128,
      "epoch": 0.024733076895423662,
      "loss/policy_avg": -0.17764857411384583,
      "lr": 9.835761758691207e-06,
      "objective/entropy": 130.6345977783203,
      "objective/kl": 34.35237121582031,
      "objective/non_score_reward": -1.717618465423584,
      "objective/rlhf_reward": -5.314214794841364,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 118.99533081054688,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.3974609375,
      "step": 257,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.014057159423828
    },
    {
      "episode": 4144,
      "epoch": 0.02482894153455321,
      "loss/policy_avg": 2.400163173675537,
      "lr": 9.835122699386504e-06,
      "objective/entropy": 123.72301483154297,
      "objective/kl": 21.25601577758789,
      "objective/non_score_reward": -1.0628007650375366,
      "objective/rlhf_reward": 0.1487968802452091,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 36.07887268066406,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.572265625,
      "step": 258,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.998108148574829
    },
    {
      "episode": 4160,
      "epoch": 0.02492480617368276,
      "loss/policy_avg": 0.3900964856147766,
      "lr": 9.8344836400818e-06,
      "objective/entropy": 233.3748321533203,
      "objective/kl": 42.447425842285156,
      "objective/non_score_reward": -2.1223714351654053,
      "objective/rlhf_reward": -5.5657667263757915,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 19.722026824951172,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.74609375,
      "step": 259,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.000584363937378
    },
    {
      "episode": 4176,
      "epoch": 0.02502067081281231,
      "loss/policy_avg": 0.3361247181892395,
      "lr": 9.833844580777096e-06,
      "objective/entropy": 135.13961791992188,
      "objective/kl": 31.25783920288086,
      "objective/non_score_reward": -1.5628920793533325,
      "objective/rlhf_reward": -4.426739449771952,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 16.49414825439453,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.486328125,
      "step": 260,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9986810684204102
    },
    {
      "episode": 4192,
      "epoch": 0.025116535451941858,
      "loss/policy_avg": 0.1438344419002533,
      "lr": 9.833205521472393e-06,
      "objective/entropy": 104.18168640136719,
      "objective/kl": 35.72525405883789,
      "objective/non_score_reward": -1.7862627506256104,
      "objective/rlhf_reward": -5.320222015651773,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 22.100770950317383,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.65625,
      "step": 261,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9996044635772705
    },
    {
      "episode": 4208,
      "epoch": 0.025212400091071407,
      "loss/policy_avg": 2.402132034301758,
      "lr": 9.83256646216769e-06,
      "objective/entropy": 91.16908264160156,
      "objective/kl": 29.633235931396484,
      "objective/non_score_reward": -1.4816619157791138,
      "objective/rlhf_reward": -4.476049522967681,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 43.586891174316406,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.64453125,
      "step": 262,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.004427433013916
    },
    {
      "episode": 4224,
      "epoch": 0.025308264730200956,
      "loss/policy_avg": 0.7259080410003662,
      "lr": 9.831927402862987e-06,
      "objective/entropy": 154.68115234375,
      "objective/kl": 37.00696563720703,
      "objective/non_score_reward": -1.8503483533859253,
      "objective/rlhf_reward": -5.576564307483743,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 16.052043914794922,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.484375,
      "step": 263,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9974215030670166
    },
    {
      "episode": 4240,
      "epoch": 0.025404129369330505,
      "loss/policy_avg": 0.09373458474874496,
      "lr": 9.831288343558284e-06,
      "objective/entropy": 72.85606384277344,
      "objective/kl": 27.522302627563477,
      "objective/non_score_reward": -1.376115083694458,
      "objective/rlhf_reward": -3.679631943973612,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 142.1138916015625,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.310546875,
      "step": 264,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9991629123687744
    },
    {
      "episode": 4256,
      "epoch": 0.025499994008460054,
      "loss/policy_avg": 0.7555310130119324,
      "lr": 9.830649284253579e-06,
      "objective/entropy": 72.61222076416016,
      "objective/kl": 30.647029876708984,
      "objective/non_score_reward": -1.5323514938354492,
      "objective/rlhf_reward": -4.705573756893244,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 54.394874572753906,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.587890625,
      "step": 265,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0006964206695557
    },
    {
      "episode": 4272,
      "epoch": 0.025595858647589603,
      "loss/policy_avg": 0.6551899313926697,
      "lr": 9.830010224948876e-06,
      "objective/entropy": 121.19924926757812,
      "objective/kl": 33.96527099609375,
      "objective/non_score_reward": -1.6982636451721191,
      "objective/rlhf_reward": -5.131195192754852,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 40.39656066894531,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.470703125,
      "step": 266,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.999192476272583
    },
    {
      "episode": 4288,
      "epoch": 0.02569172328671915,
      "loss/policy_avg": 1.1016074419021606,
      "lr": 9.829371165644173e-06,
      "objective/entropy": 132.00601196289062,
      "objective/kl": 43.09049987792969,
      "objective/non_score_reward": -2.154524803161621,
      "objective/rlhf_reward": -7.102327191623386,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 126.27546691894531,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.3935546875,
      "step": 267,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9990718364715576
    },
    {
      "episode": 4304,
      "epoch": 0.0257875879258487,
      "loss/policy_avg": 0.08981708437204361,
      "lr": 9.82873210633947e-06,
      "objective/entropy": 140.80239868164062,
      "objective/kl": 26.626178741455078,
      "objective/non_score_reward": -1.3313090801239014,
      "objective/rlhf_reward": -0.9252360224723812,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 84.53665924072266,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.841796875,
      "step": 268,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.997150182723999
    },
    {
      "episode": 4320,
      "epoch": 0.02588345256497825,
      "loss/policy_avg": 0.565528929233551,
      "lr": 9.828093047034766e-06,
      "objective/entropy": 138.6593017578125,
      "objective/kl": 32.08763885498047,
      "objective/non_score_reward": -1.604382038116455,
      "objective/rlhf_reward": -4.813408408228474,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 34.42543029785156,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.427734375,
      "step": 269,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.0018558502197266
    },
    {
      "episode": 4336,
      "epoch": 0.0259793172041078,
      "loss/policy_avg": 0.4312899708747864,
      "lr": 9.827453987730061e-06,
      "objective/entropy": 20.17654800415039,
      "objective/kl": 23.528181076049805,
      "objective/non_score_reward": -1.176409125328064,
      "objective/rlhf_reward": -2.5829304478326183,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 20.440711975097656,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.7890625,
      "step": 270,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9989676475524902
    },
    {
      "episode": 4352,
      "epoch": 0.026075181843237347,
      "loss/policy_avg": 0.20729105174541473,
      "lr": 9.826814928425358e-06,
      "objective/entropy": 166.21115112304688,
      "objective/kl": 31.01326560974121,
      "objective/non_score_reward": -1.5506633520126343,
      "objective/rlhf_reward": -6.202653288841248,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 34.41830825805664,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.654296875,
      "step": 271,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.003887176513672
    },
    {
      "episode": 4368,
      "epoch": 0.026171046482366896,
      "loss/policy_avg": 3.2944061756134033,
      "lr": 9.826175869120655e-06,
      "objective/entropy": 28.755096435546875,
      "objective/kl": 31.482175827026367,
      "objective/non_score_reward": -1.5741088390350342,
      "objective/rlhf_reward": -4.917833187667233,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 5.366632461547852,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.3701171875,
      "step": 272,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0053317546844482
    },
    {
      "episode": 4384,
      "epoch": 0.02626691112149645,
      "loss/policy_avg": 0.23004142940044403,
      "lr": 9.825536809815952e-06,
      "objective/entropy": 54.82402038574219,
      "objective/kl": 32.45307922363281,
      "objective/non_score_reward": -1.6226541996002197,
      "objective/rlhf_reward": -5.148980966120391,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 31.775432586669922,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.525390625,
      "step": 273,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9995795488357544
    },
    {
      "episode": 4400,
      "epoch": 0.026362775760625998,
      "loss/policy_avg": -0.08435960114002228,
      "lr": 9.824897750511249e-06,
      "objective/entropy": 98.25897216796875,
      "objective/kl": 28.68474578857422,
      "objective/non_score_reward": -1.4342372417449951,
      "objective/rlhf_reward": -5.73694920539856,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 72.97157287597656,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.517578125,
      "step": 274,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0012598037719727
    },
    {
      "episode": 4416,
      "epoch": 0.026458640399755547,
      "loss/policy_avg": 0.41626134514808655,
      "lr": 9.824258691206546e-06,
      "objective/entropy": 83.60694885253906,
      "objective/kl": 30.977035522460938,
      "objective/non_score_reward": -1.548851728439331,
      "objective/rlhf_reward": -4.795407152175903,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 39.04691696166992,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.60546875,
      "step": 275,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0021119117736816
    },
    {
      "episode": 4432,
      "epoch": 0.026554505038885096,
      "loss/policy_avg": 0.43957769870758057,
      "lr": 9.823619631901841e-06,
      "objective/entropy": 127.34529113769531,
      "objective/kl": 35.28544616699219,
      "objective/non_score_reward": -1.7642724514007568,
      "objective/rlhf_reward": -5.606491903872833,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 150.78646850585938,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.748046875,
      "step": 276,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9970619678497314
    },
    {
      "episode": 4448,
      "epoch": 0.026650369678014645,
      "loss/policy_avg": 0.8086847066879272,
      "lr": 9.822980572597138e-06,
      "objective/entropy": -119.74644470214844,
      "objective/kl": 26.706302642822266,
      "objective/non_score_reward": -1.335315227508545,
      "objective/rlhf_reward": -3.9412606716156002,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 65.78569793701172,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.568359375,
      "step": 277,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9986886978149414
    },
    {
      "episode": 4464,
      "epoch": 0.026746234317144194,
      "loss/policy_avg": 0.09760895371437073,
      "lr": 9.822341513292433e-06,
      "objective/entropy": 209.31890869140625,
      "objective/kl": 41.666831970214844,
      "objective/non_score_reward": -2.083341598510742,
      "objective/rlhf_reward": -6.7292466498056225,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 14.525606155395508,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6328125,
      "step": 278,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9980053901672363
    },
    {
      "episode": 4480,
      "epoch": 0.026842098956273742,
      "loss/policy_avg": 0.0820450559258461,
      "lr": 9.82170245398773e-06,
      "objective/entropy": 152.01095581054688,
      "objective/kl": 29.104724884033203,
      "objective/non_score_reward": -1.4552361965179443,
      "objective/rlhf_reward": -4.159085219324218,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 21.12679100036621,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4560546875,
      "step": 279,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.99649977684021
    },
    {
      "episode": 4496,
      "epoch": 0.02693796359540329,
      "loss/policy_avg": 0.08112587034702301,
      "lr": 9.821063394683027e-06,
      "objective/entropy": 49.22539138793945,
      "objective/kl": 32.40191650390625,
      "objective/non_score_reward": -1.6200958490371704,
      "objective/rlhf_reward": -5.029785375209197,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 4.874902725219727,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.404296875,
      "step": 280,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0027780532836914
    },
    {
      "episode": 4512,
      "epoch": 0.02703382823453284,
      "loss/policy_avg": 0.41851094365119934,
      "lr": 9.820424335378324e-06,
      "objective/entropy": 108.13827514648438,
      "objective/kl": 44.792015075683594,
      "objective/non_score_reward": -2.239600658416748,
      "objective/rlhf_reward": -7.133574362072061,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 67.72032165527344,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.537109375,
      "step": 281,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9979641437530518
    },
    {
      "episode": 4528,
      "epoch": 0.02712969287366239,
      "loss/policy_avg": 0.8327301144599915,
      "lr": 9.81978527607362e-06,
      "objective/entropy": 70.98486328125,
      "objective/kl": 43.82145690917969,
      "objective/non_score_reward": -2.191072702407837,
      "objective/rlhf_reward": -7.283338430340647,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 1.1268585920333862,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.427734375,
      "step": 282,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.002413749694824
    },
    {
      "episode": 4544,
      "epoch": 0.02722555751279194,
      "loss/policy_avg": 0.26003268361091614,
      "lr": 9.819146216768916e-06,
      "objective/entropy": 59.813140869140625,
      "objective/kl": 32.33997344970703,
      "objective/non_score_reward": -1.6169987916946411,
      "objective/rlhf_reward": -4.643166418346476,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 108.00172424316406,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.60546875,
      "step": 283,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.996032953262329
    },
    {
      "episode": 4560,
      "epoch": 0.027321422151921487,
      "loss/policy_avg": 0.06828334182500839,
      "lr": 9.818507157464213e-06,
      "objective/entropy": 164.7733154296875,
      "objective/kl": 36.976539611816406,
      "objective/non_score_reward": -1.8488272428512573,
      "objective/rlhf_reward": -5.791188750330525,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 22.712989807128906,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.498046875,
      "step": 284,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.998927354812622
    },
    {
      "episode": 4576,
      "epoch": 0.027417286791051036,
      "loss/policy_avg": 0.346102774143219,
      "lr": 9.81786809815951e-06,
      "objective/entropy": 141.91213989257812,
      "objective/kl": 29.89690589904785,
      "objective/non_score_reward": -1.4948452711105347,
      "objective/rlhf_reward": -4.5793810248374935,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 4.914261817932129,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.583984375,
      "step": 285,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9991732835769653
    },
    {
      "episode": 4592,
      "epoch": 0.027513151430180585,
      "loss/policy_avg": 0.07111110538244247,
      "lr": 9.817229038854806e-06,
      "objective/entropy": -41.44879150390625,
      "objective/kl": 29.296417236328125,
      "objective/non_score_reward": -1.4648208618164062,
      "objective/rlhf_reward": -4.4806815172113,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 70.16557312011719,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5625,
      "step": 286,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9982048273086548
    },
    {
      "episode": 4608,
      "epoch": 0.027609016069310134,
      "loss/policy_avg": 0.6204440593719482,
      "lr": 9.816589979550103e-06,
      "objective/entropy": 10.609687805175781,
      "objective/kl": 34.5562744140625,
      "objective/non_score_reward": -1.727813720703125,
      "objective/rlhf_reward": -5.552005314563198,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 44.11948776245117,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4677734375,
      "step": 287,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9960062503814697
    },
    {
      "episode": 4624,
      "epoch": 0.027704880708439683,
      "loss/policy_avg": -0.3703474700450897,
      "lr": 9.8159509202454e-06,
      "objective/entropy": 16.20748519897461,
      "objective/kl": 40.348899841308594,
      "objective/non_score_reward": -2.0174450874328613,
      "objective/rlhf_reward": -5.6697804689407345,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 58.94084167480469,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.451171875,
      "step": 288,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000488519668579
    },
    {
      "episode": 4640,
      "epoch": 0.027800745347569232,
      "loss/policy_avg": 0.691341757774353,
      "lr": 9.815311860940695e-06,
      "objective/entropy": 164.64894104003906,
      "objective/kl": 35.96034240722656,
      "objective/non_score_reward": -1.7980170249938965,
      "objective/rlhf_reward": -2.792067980766296,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 105.621826171875,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.53125,
      "step": 289,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9972904920578003
    },
    {
      "episode": 4656,
      "epoch": 0.02789660998669878,
      "loss/policy_avg": 0.05122673511505127,
      "lr": 9.814672801635992e-06,
      "objective/entropy": 143.17758178710938,
      "objective/kl": 27.651023864746094,
      "objective/non_score_reward": -1.3825511932373047,
      "objective/rlhf_reward": -2.6064857586633887,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 30.806257247924805,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.462890625,
      "step": 290,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.996225118637085
    },
    {
      "episode": 4672,
      "epoch": 0.02799247462582833,
      "loss/policy_avg": -0.021466929465532303,
      "lr": 9.81403374233129e-06,
      "objective/entropy": 123.44010925292969,
      "objective/kl": 18.645748138427734,
      "objective/non_score_reward": -0.9322873950004578,
      "objective/rlhf_reward": -2.403636608153505,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 24.915597915649414,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.638671875,
      "step": 291,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.995213270187378
    },
    {
      "episode": 4688,
      "epoch": 0.02808833926495788,
      "loss/policy_avg": 0.700859785079956,
      "lr": 9.813394683026586e-06,
      "objective/entropy": 58.48292922973633,
      "objective/kl": 28.2305965423584,
      "objective/non_score_reward": -1.411529779434204,
      "objective/rlhf_reward": -4.24611941576004,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 21.04977035522461,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4208984375,
      "step": 292,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9981340169906616
    },
    {
      "episode": 4704,
      "epoch": 0.028184203904087428,
      "loss/policy_avg": 0.9605820775032043,
      "lr": 9.812755623721883e-06,
      "objective/entropy": -33.6519775390625,
      "objective/kl": 33.635501861572266,
      "objective/non_score_reward": -1.6817750930786133,
      "objective/rlhf_reward": -5.065241103590118,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 18.019363403320312,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4619140625,
      "step": 293,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.000504493713379
    },
    {
      "episode": 4720,
      "epoch": 0.028280068543216977,
      "loss/policy_avg": 0.44443511962890625,
      "lr": 9.81211656441718e-06,
      "objective/entropy": 61.81305694580078,
      "objective/kl": 37.54548263549805,
      "objective/non_score_reward": -1.8772742748260498,
      "objective/rlhf_reward": -6.130494453994137,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 34.736690521240234,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.701171875,
      "step": 294,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9981006383895874
    },
    {
      "episode": 4736,
      "epoch": 0.028375933182346526,
      "loss/policy_avg": -0.004817202687263489,
      "lr": 9.811477505112475e-06,
      "objective/entropy": -85.25079345703125,
      "objective/kl": 22.125272750854492,
      "objective/non_score_reward": -1.1062637567520142,
      "objective/rlhf_reward": -3.0658050415262412,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 39.945377349853516,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.619140625,
      "step": 295,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001112937927246
    },
    {
      "episode": 4752,
      "epoch": 0.028471797821476075,
      "loss/policy_avg": -0.018911486491560936,
      "lr": 9.810838445807772e-06,
      "objective/entropy": 187.50953674316406,
      "objective/kl": 31.752737045288086,
      "objective/non_score_reward": -1.587636947631836,
      "objective/rlhf_reward": -4.525718684467386,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 41.095298767089844,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.560546875,
      "step": 296,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.0318245887756348
    },
    {
      "episode": 4768,
      "epoch": 0.028567662460605624,
      "loss/policy_avg": 0.5813855528831482,
      "lr": 9.810199386503069e-06,
      "objective/entropy": 13.395767211914062,
      "objective/kl": 29.76428985595703,
      "objective/non_score_reward": -1.4882144927978516,
      "objective/rlhf_reward": -4.219524757067362,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 58.40808868408203,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.62109375,
      "step": 297,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9971027374267578
    },
    {
      "episode": 4784,
      "epoch": 0.028663527099735173,
      "loss/policy_avg": 0.25174012780189514,
      "lr": 9.809560327198366e-06,
      "objective/entropy": 93.99857330322266,
      "objective/kl": 31.07823944091797,
      "objective/non_score_reward": -1.5539120435714722,
      "objective/rlhf_reward": -4.482314721743266,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 56.219329833984375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.767578125,
      "step": 298,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9973026514053345
    },
    {
      "episode": 4800,
      "epoch": 0.02875939173886472,
      "loss/policy_avg": -0.05966740474104881,
      "lr": 9.808921267893663e-06,
      "objective/entropy": 199.3701934814453,
      "objective/kl": 26.15532684326172,
      "objective/non_score_reward": -1.3077664375305176,
      "objective/rlhf_reward": -3.7152936098896827,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 18.272422790527344,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.650390625,
      "step": 299,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.002091646194458
    },
    {
      "episode": 4816,
      "epoch": 0.02885525637799427,
      "loss/policy_avg": 0.19725301861763,
      "lr": 9.808282208588958e-06,
      "objective/entropy": 112.11613464355469,
      "objective/kl": 33.344722747802734,
      "objective/non_score_reward": -1.667236089706421,
      "objective/rlhf_reward": -6.668944478034973,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 29.54242706298828,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.640625,
      "step": 300,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0038766860961914
    },
    {
      "episode": 4832,
      "epoch": 0.02895112101712382,
      "loss/policy_avg": -0.17506346106529236,
      "lr": 9.807643149284255e-06,
      "objective/entropy": 70.48281860351562,
      "objective/kl": 29.51511573791504,
      "objective/non_score_reward": -1.4757558107376099,
      "objective/rlhf_reward": -4.387251400741276,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 12.791141510009766,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4814453125,
      "step": 301,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999895691871643
    },
    {
      "episode": 4848,
      "epoch": 0.029046985656253372,
      "loss/policy_avg": 0.38140204548835754,
      "lr": 9.80700408997955e-06,
      "objective/entropy": 23.643152236938477,
      "objective/kl": 27.579925537109375,
      "objective/non_score_reward": -1.3789963722229004,
      "objective/rlhf_reward": -3.854125951946364,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 8.89024543762207,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.58203125,
      "step": 302,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9984774589538574
    },
    {
      "episode": 4864,
      "epoch": 0.02914285029538292,
      "loss/policy_avg": 0.18466374278068542,
      "lr": 9.806365030674847e-06,
      "objective/entropy": -30.63671875,
      "objective/kl": 25.678733825683594,
      "objective/non_score_reward": -1.2839367389678955,
      "objective/rlhf_reward": -3.6199750540577735,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 4.08036470413208,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.53515625,
      "step": 303,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999252200126648
    },
    {
      "episode": 4880,
      "epoch": 0.02923871493451247,
      "loss/policy_avg": 0.20352232456207275,
      "lr": 9.805725971370144e-06,
      "objective/entropy": -14.465229034423828,
      "objective/kl": 16.88151741027832,
      "objective/non_score_reward": -0.8440757989883423,
      "objective/rlhf_reward": 1.023696751892567,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 16.945369720458984,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.51171875,
      "step": 304,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.997638463973999
    },
    {
      "episode": 4896,
      "epoch": 0.02933457957364202,
      "loss/policy_avg": 0.36892420053482056,
      "lr": 9.80508691206544e-06,
      "objective/entropy": 136.53363037109375,
      "objective/kl": 30.262548446655273,
      "objective/non_score_reward": -1.513127326965332,
      "objective/rlhf_reward": -3.1287905319940776,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 30.166175842285156,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.615234375,
      "step": 305,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0001025199890137
    },
    {
      "episode": 4912,
      "epoch": 0.029430444212771568,
      "loss/policy_avg": 0.07577557861804962,
      "lr": 9.804447852760737e-06,
      "objective/entropy": 77.17935943603516,
      "objective/kl": 28.32352638244629,
      "objective/non_score_reward": -1.4161763191223145,
      "objective/rlhf_reward": -4.148933493884739,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 2.6957955360412598,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.63671875,
      "step": 306,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0025148391723633
    },
    {
      "episode": 4928,
      "epoch": 0.029526308851901117,
      "loss/policy_avg": 0.1559610664844513,
      "lr": 9.803808793456034e-06,
      "objective/entropy": -16.938400268554688,
      "objective/kl": 21.827743530273438,
      "objective/non_score_reward": -1.091387152671814,
      "objective/rlhf_reward": -2.703689043939696,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 7.885660171508789,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.431640625,
      "step": 307,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0027849674224854
    },
    {
      "episode": 4944,
      "epoch": 0.029622173491030666,
      "loss/policy_avg": -0.17305535078048706,
      "lr": 9.80316973415133e-06,
      "objective/entropy": -31.412694931030273,
      "objective/kl": 23.805431365966797,
      "objective/non_score_reward": -1.1902716159820557,
      "objective/rlhf_reward": -3.1569663322606853,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 35.29633331298828,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.654296875,
      "step": 308,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0007619857788086
    },
    {
      "episode": 4960,
      "epoch": 0.029718038130160215,
      "loss/policy_avg": 0.13406828045845032,
      "lr": 9.802530674846626e-06,
      "objective/entropy": 68.0604248046875,
      "objective/kl": 31.641517639160156,
      "objective/non_score_reward": -1.582075834274292,
      "objective/rlhf_reward": -4.949701407042843,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 32.652069091796875,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.3017578125,
      "step": 309,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9981597661972046
    },
    {
      "episode": 4976,
      "epoch": 0.029813902769289764,
      "loss/policy_avg": 0.3640270233154297,
      "lr": 9.801891615541923e-06,
      "objective/entropy": 73.73117065429688,
      "objective/kl": 22.181957244873047,
      "objective/non_score_reward": -1.109097957611084,
      "objective/rlhf_reward": -4.436391651630402,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 24.474929809570312,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.77734375,
      "step": 310,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9988360404968262
    },
    {
      "episode": 4992,
      "epoch": 0.029909767408419313,
      "loss/policy_avg": 0.598778486251831,
      "lr": 9.80125255623722e-06,
      "objective/entropy": 77.45819854736328,
      "objective/kl": 31.91500473022461,
      "objective/non_score_reward": -1.5957502126693726,
      "objective/rlhf_reward": -4.558172132047723,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 7.392116546630859,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.595703125,
      "step": 311,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9976756572723389
    },
    {
      "episode": 5008,
      "epoch": 0.03000563204754886,
      "loss/policy_avg": -0.14829277992248535,
      "lr": 9.800613496932517e-06,
      "objective/entropy": 73.91107940673828,
      "objective/kl": 22.043235778808594,
      "objective/non_score_reward": -1.1021617650985718,
      "objective/rlhf_reward": -3.0086471796035763,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 7.375496864318848,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.548828125,
      "step": 312,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0021305084228516
    },
    {
      "episode": 5024,
      "epoch": 0.03010149668667841,
      "loss/policy_avg": 0.34449532628059387,
      "lr": 9.799974437627812e-06,
      "objective/entropy": 27.04425048828125,
      "objective/kl": 31.98007583618164,
      "objective/non_score_reward": -1.599003791809082,
      "objective/rlhf_reward": -4.996015524864196,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 53.630210876464844,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.529296875,
      "step": 313,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9990134239196777
    },
    {
      "episode": 5040,
      "epoch": 0.03019736132580796,
      "loss/policy_avg": 0.029857225716114044,
      "lr": 9.799335378323109e-06,
      "objective/entropy": 147.96096801757812,
      "objective/kl": 27.342838287353516,
      "objective/non_score_reward": -1.3671419620513916,
      "objective/rlhf_reward": -4.017969946475372,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 18.108400344848633,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.638671875,
      "step": 314,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9988747835159302
    },
    {
      "episode": 5056,
      "epoch": 0.03029322596493751,
      "loss/policy_avg": 0.05283927917480469,
      "lr": 9.798696319018406e-06,
      "objective/entropy": -46.846099853515625,
      "objective/kl": 30.715242385864258,
      "objective/non_score_reward": -1.535762071609497,
      "objective/rlhf_reward": -4.538928542200642,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 66.26033020019531,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.6484375,
      "step": 315,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9992979764938354
    },
    {
      "episode": 5072,
      "epoch": 0.030389090604067057,
      "loss/policy_avg": 0.2858242094516754,
      "lr": 9.798057259713703e-06,
      "objective/entropy": -156.9435577392578,
      "objective/kl": 31.284622192382812,
      "objective/non_score_reward": -1.5642311573028564,
      "objective/rlhf_reward": -4.915288856535583,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 74.38943481445312,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.7421875,
      "step": 316,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9992833137512207
    },
    {
      "episode": 5088,
      "epoch": 0.030484955243196606,
      "loss/policy_avg": 0.28274843096733093,
      "lr": 9.797418200409e-06,
      "objective/entropy": -214.69573974609375,
      "objective/kl": 22.27606201171875,
      "objective/non_score_reward": -1.1138031482696533,
      "objective/rlhf_reward": -2.3325063607850414,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 35.48945236206055,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.505859375,
      "step": 317,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9962687492370605
    },
    {
      "episode": 5104,
      "epoch": 0.030580819882326155,
      "loss/policy_avg": -0.08736838400363922,
      "lr": 9.796779141104296e-06,
      "objective/entropy": -18.148971557617188,
      "objective/kl": 27.546077728271484,
      "objective/non_score_reward": -1.377303957939148,
      "objective/rlhf_reward": -4.1306134844697535,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 76.84832000732422,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.6171875,
      "step": 318,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0014290809631348
    },
    {
      "episode": 5120,
      "epoch": 0.030676684521455704,
      "loss/policy_avg": 0.031098078936338425,
      "lr": 9.796140081799592e-06,
      "objective/entropy": 103.30211639404297,
      "objective/kl": 27.747032165527344,
      "objective/non_score_reward": -1.3873515129089355,
      "objective/rlhf_reward": -4.033634447845158,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 54.69970703125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.65234375,
      "step": 319,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9985463619232178
    },
    {
      "episode": 5136,
      "epoch": 0.030772549160585253,
      "loss/policy_avg": 0.3622899651527405,
      "lr": 9.795501022494888e-06,
      "objective/entropy": 66.0567398071289,
      "objective/kl": 26.39444351196289,
      "objective/non_score_reward": -1.3197221755981445,
      "objective/rlhf_reward": -3.7226295759349615,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 5.640605449676514,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6484375,
      "step": 320,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9992847442626953
    },
    {
      "episode": 5152,
      "epoch": 0.030868413799714802,
      "loss/policy_avg": -0.10469883680343628,
      "lr": 9.794861963190185e-06,
      "objective/entropy": 35.81920623779297,
      "objective/kl": 25.668739318847656,
      "objective/non_score_reward": -1.2834370136260986,
      "objective/rlhf_reward": -3.7337480843067166,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 5.808808326721191,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.6484375,
      "step": 321,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.999007225036621
    },
    {
      "episode": 5168,
      "epoch": 0.03096427843884435,
      "loss/policy_avg": -0.2741212248802185,
      "lr": 9.794222903885482e-06,
      "objective/entropy": 52.38888168334961,
      "objective/kl": 34.969974517822266,
      "objective/non_score_reward": -1.748498797416687,
      "objective/rlhf_reward": -5.652359655409484,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 8.913843154907227,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.3798828125,
      "step": 322,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0103840827941895
    },
    {
      "episode": 5184,
      "epoch": 0.0310601430779739,
      "loss/policy_avg": 0.30122414231300354,
      "lr": 9.793583844580777e-06,
      "objective/entropy": 134.16075134277344,
      "objective/kl": 25.608116149902344,
      "objective/non_score_reward": -1.280405879020691,
      "objective/rlhf_reward": -3.1742123318480804,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 74.33633422851562,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.603515625,
      "step": 323,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.014462471008301
    },
    {
      "episode": 5200,
      "epoch": 0.03115600771710345,
      "loss/policy_avg": 0.26204991340637207,
      "lr": 9.792944785276074e-06,
      "objective/entropy": 2.559833526611328,
      "objective/kl": 25.519519805908203,
      "objective/non_score_reward": -1.2759759426116943,
      "objective/rlhf_reward": -3.74465426180212,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 92.09954071044922,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.57421875,
      "step": 324,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.998145580291748
    },
    {
      "episode": 5216,
      "epoch": 0.031251872356233,
      "loss/policy_avg": 0.18864840269088745,
      "lr": 9.792305725971371e-06,
      "objective/entropy": 48.99184036254883,
      "objective/kl": 28.022377014160156,
      "objective/non_score_reward": -1.4011187553405762,
      "objective/rlhf_reward": -4.123522403653025,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 22.120746612548828,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.60546875,
      "step": 325,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9984712600708008
    },
    {
      "episode": 5232,
      "epoch": 0.03134773699536255,
      "loss/policy_avg": 0.42162489891052246,
      "lr": 9.791666666666666e-06,
      "objective/entropy": -129.23065185546875,
      "objective/kl": 31.687660217285156,
      "objective/non_score_reward": -1.5843830108642578,
      "objective/rlhf_reward": -4.821760052236256,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 111.98194885253906,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.587890625,
      "step": 326,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.996734857559204
    },
    {
      "episode": 5248,
      "epoch": 0.0314436016344921,
      "loss/policy_avg": -0.07900102436542511,
      "lr": 9.791027607361963e-06,
      "objective/entropy": 31.351696014404297,
      "objective/kl": 27.038206100463867,
      "objective/non_score_reward": -1.3519103527069092,
      "objective/rlhf_reward": -5.407641291618347,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 9.7061767578125,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.4423828125,
      "step": 327,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0043118000030518
    },
    {
      "episode": 5264,
      "epoch": 0.03153946627362165,
      "loss/policy_avg": 0.16587843000888824,
      "lr": 9.79038854805726e-06,
      "objective/entropy": 143.86651611328125,
      "objective/kl": 27.42593765258789,
      "objective/non_score_reward": -1.3712968826293945,
      "objective/rlhf_reward": -4.125937962268276,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 119.49800872802734,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.767578125,
      "step": 328,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.999497890472412
    },
    {
      "episode": 5280,
      "epoch": 0.0316353309127512,
      "loss/policy_avg": 0.29106539487838745,
      "lr": 9.789749488752557e-06,
      "objective/entropy": 67.8651351928711,
      "objective/kl": 32.114479064941406,
      "objective/non_score_reward": -1.6057239770889282,
      "objective/rlhf_reward": -5.08126013567987,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 7.976801872253418,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.470703125,
      "step": 329,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.0011985301971436
    },
    {
      "episode": 5296,
      "epoch": 0.031731195551880746,
      "loss/policy_avg": 0.5780457258224487,
      "lr": 9.789110429447854e-06,
      "objective/entropy": 104.15371704101562,
      "objective/kl": 30.92220687866211,
      "objective/non_score_reward": -1.5461102724075317,
      "objective/rlhf_reward": -3.784441030025482,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 52.566375732421875,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.55859375,
      "step": 330,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9993044137954712
    },
    {
      "episode": 5312,
      "epoch": 0.031827060191010295,
      "loss/policy_avg": 0.24728742241859436,
      "lr": 9.78847137014315e-06,
      "objective/entropy": -95.75634765625,
      "objective/kl": 30.755779266357422,
      "objective/non_score_reward": -1.5377888679504395,
      "objective/rlhf_reward": -4.791905486319942,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 32.567970275878906,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.666015625,
      "step": 331,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9991464614868164
    },
    {
      "episode": 5328,
      "epoch": 0.031922924830139844,
      "loss/policy_avg": 1.9531396627426147,
      "lr": 9.787832310838446e-06,
      "objective/entropy": 18.057151794433594,
      "objective/kl": 21.966590881347656,
      "objective/non_score_reward": -1.0983295440673828,
      "objective/rlhf_reward": -2.993318116664886,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 11.555295944213867,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.716796875,
      "step": 332,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.004971981048584
    },
    {
      "episode": 5344,
      "epoch": 0.03201878946926939,
      "loss/policy_avg": 0.0304682869464159,
      "lr": 9.787193251533743e-06,
      "objective/entropy": -100.86114501953125,
      "objective/kl": 21.19540023803711,
      "objective/non_score_reward": -1.0597699880599976,
      "objective/rlhf_reward": -2.8604777837670863,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 36.17786407470703,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.46875,
      "step": 333,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.997187852859497
    },
    {
      "episode": 5360,
      "epoch": 0.03211465410839894,
      "loss/policy_avg": 0.2974792718887329,
      "lr": 9.78655419222904e-06,
      "objective/entropy": 59.0064697265625,
      "objective/kl": 23.83527183532715,
      "objective/non_score_reward": -1.1917636394500732,
      "objective/rlhf_reward": -3.2861017016724343,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 27.08124542236328,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.76171875,
      "step": 334,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0005297660827637
    },
    {
      "episode": 5376,
      "epoch": 0.03221051874752849,
      "loss/policy_avg": 0.20310130715370178,
      "lr": 9.785915132924337e-06,
      "objective/entropy": 51.579200744628906,
      "objective/kl": 26.064043045043945,
      "objective/non_score_reward": -1.3032021522521973,
      "objective/rlhf_reward": -3.656549363341883,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 3.1224026679992676,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.51171875,
      "step": 335,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0020689964294434
    },
    {
      "episode": 5392,
      "epoch": 0.03230638338665804,
      "loss/policy_avg": -0.22360196709632874,
      "lr": 9.785276073619633e-06,
      "objective/entropy": 8.019195556640625,
      "objective/kl": 34.267356872558594,
      "objective/non_score_reward": -1.7133680582046509,
      "objective/rlhf_reward": -5.40287409266983,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 27.402694702148438,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.517578125,
      "step": 336,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.99736487865448
    },
    {
      "episode": 5408,
      "epoch": 0.03240224802578759,
      "loss/policy_avg": 0.394004225730896,
      "lr": 9.784637014314929e-06,
      "objective/entropy": -7.316375732421875,
      "objective/kl": 34.60337829589844,
      "objective/non_score_reward": -1.7301688194274902,
      "objective/rlhf_reward": -3.9969565018427105,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 60.58606719970703,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.609375,
      "step": 337,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9990360736846924
    },
    {
      "episode": 5424,
      "epoch": 0.03249811266491714,
      "loss/policy_avg": 0.08118537068367004,
      "lr": 9.783997955010226e-06,
      "objective/entropy": 3.808826446533203,
      "objective/kl": 33.9757080078125,
      "objective/non_score_reward": -1.6987853050231934,
      "objective/rlhf_reward": -5.3713093592720895,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 49.47349548339844,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.546875,
      "step": 338,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9974095821380615
    },
    {
      "episode": 5440,
      "epoch": 0.03259397730404669,
      "loss/policy_avg": 0.1250596046447754,
      "lr": 9.783358895705522e-06,
      "objective/entropy": -42.7471809387207,
      "objective/kl": 27.222618103027344,
      "objective/non_score_reward": -1.361130952835083,
      "objective/rlhf_reward": -3.9287524459683265,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 8.669515609741211,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.509765625,
      "step": 339,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0002639293670654
    },
    {
      "episode": 5456,
      "epoch": 0.032689841943176236,
      "loss/policy_avg": 1.2977867126464844,
      "lr": 9.78271983640082e-06,
      "objective/entropy": -60.51675796508789,
      "objective/kl": 27.726932525634766,
      "objective/non_score_reward": -1.3863465785980225,
      "objective/rlhf_reward": -4.064433994706034,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 52.59510803222656,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4345703125,
      "step": 340,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9984922409057617
    },
    {
      "episode": 5472,
      "epoch": 0.032785706582305785,
      "loss/policy_avg": 0.10771232098340988,
      "lr": 9.782080777096116e-06,
      "objective/entropy": 39.22501754760742,
      "objective/kl": 38.581573486328125,
      "objective/non_score_reward": -1.9290788173675537,
      "objective/rlhf_reward": -6.374679616003662,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 16.336502075195312,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.599609375,
      "step": 341,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9990124702453613
    },
    {
      "episode": 5488,
      "epoch": 0.032881571221435334,
      "loss/policy_avg": 0.029969744384288788,
      "lr": 9.781441717791413e-06,
      "objective/entropy": 54.763675689697266,
      "objective/kl": 27.586057662963867,
      "objective/non_score_reward": -1.379302978515625,
      "objective/rlhf_reward": -3.7838785807291666,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 29.997591018676758,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4462890625,
      "step": 342,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9992620944976807
    },
    {
      "episode": 5504,
      "epoch": 0.03297743586056488,
      "loss/policy_avg": -0.003006638027727604,
      "lr": 9.780802658486708e-06,
      "objective/entropy": 4.6327056884765625,
      "objective/kl": 25.01122283935547,
      "objective/non_score_reward": -1.250560998916626,
      "objective/rlhf_reward": -3.054833005146916,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 2.332850694656372,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.51171875,
      "step": 343,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0042150020599365
    },
    {
      "episode": 5520,
      "epoch": 0.03307330049969443,
      "loss/policy_avg": -0.2595655918121338,
      "lr": 9.780163599182005e-06,
      "objective/entropy": -9.382579803466797,
      "objective/kl": 25.310394287109375,
      "objective/non_score_reward": -1.2655197381973267,
      "objective/rlhf_reward": -3.611480812640533,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 35.86376190185547,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.708984375,
      "step": 344,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9991991519927979
    },
    {
      "episode": 5536,
      "epoch": 0.03316916513882398,
      "loss/policy_avg": 1.6723182201385498,
      "lr": 9.7795245398773e-06,
      "objective/entropy": 167.249267578125,
      "objective/kl": 38.30883026123047,
      "objective/non_score_reward": -1.915441632270813,
      "objective/rlhf_reward": -6.283164360610348,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 27.648231506347656,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.521484375,
      "step": 345,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 1.9966659545898438
    },
    {
      "episode": 5552,
      "epoch": 0.03326502977795353,
      "loss/policy_avg": 0.21136921644210815,
      "lr": 9.778885480572597e-06,
      "objective/entropy": 202.48263549804688,
      "objective/kl": 28.62633514404297,
      "objective/non_score_reward": -1.4313167333602905,
      "objective/rlhf_reward": -4.169007628169611,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 28.591995239257812,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7421875,
      "step": 346,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9962763786315918
    },
    {
      "episode": 5568,
      "epoch": 0.03336089441708308,
      "loss/policy_avg": 0.030091844499111176,
      "lr": 9.778246421267894e-06,
      "objective/entropy": 178.1235809326172,
      "objective/kl": 37.731300354003906,
      "objective/non_score_reward": -1.8865652084350586,
      "objective/rlhf_reward": -5.990001528468683,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 17.381601333618164,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.52734375,
      "step": 347,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.001326322555542
    },
    {
      "episode": 5584,
      "epoch": 0.03345675905621263,
      "loss/policy_avg": 0.40717682242393494,
      "lr": 9.777607361963191e-06,
      "objective/entropy": 90.73904418945312,
      "objective/kl": 31.88462257385254,
      "objective/non_score_reward": -1.594231128692627,
      "objective/rlhf_reward": -5.0176747677072715,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 37.96768569946289,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5703125,
      "step": 348,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9991170167922974
    },
    {
      "episode": 5600,
      "epoch": 0.033552623695342176,
      "loss/policy_avg": 0.5422201156616211,
      "lr": 9.776968302658488e-06,
      "objective/entropy": 80.41102600097656,
      "objective/kl": 34.64447021484375,
      "objective/non_score_reward": -1.7322235107421875,
      "objective/rlhf_reward": -5.478295783610687,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 117.23408508300781,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.54296875,
      "step": 349,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9983158111572266
    },
    {
      "episode": 5616,
      "epoch": 0.033648488334471725,
      "loss/policy_avg": 0.3756037950515747,
      "lr": 9.776329243353783e-06,
      "objective/entropy": 61.65838623046875,
      "objective/kl": 44.269325256347656,
      "objective/non_score_reward": -2.213466167449951,
      "objective/rlhf_reward": -7.40326676806961,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 19.3502254486084,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.583984375,
      "step": 350,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9988865852355957
    },
    {
      "episode": 5632,
      "epoch": 0.033744352973601274,
      "loss/policy_avg": 0.9775654673576355,
      "lr": 9.77569018404908e-06,
      "objective/entropy": 57.90337371826172,
      "objective/kl": 41.80830383300781,
      "objective/non_score_reward": -2.0904150009155273,
      "objective/rlhf_reward": -6.628326908747354,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 84.0235824584961,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.84765625,
      "step": 351,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9960914850234985
    },
    {
      "episode": 5648,
      "epoch": 0.03384021761273082,
      "loss/policy_avg": -0.20816992223262787,
      "lr": 9.775051124744377e-06,
      "objective/entropy": -118.41542053222656,
      "objective/kl": 23.201061248779297,
      "objective/non_score_reward": -1.160053014755249,
      "objective/rlhf_reward": -2.8153834894028416,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 4.062729835510254,
      "policy/clipfrac_avg": 2.0,
      "policy/entropy_avg": 0.4453125,
      "step": 352,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0035219192504883
    },
    {
      "episode": 5664,
      "epoch": 0.03393608225186037,
      "loss/policy_avg": 0.035901207476854324,
      "lr": 9.774412065439674e-06,
      "objective/entropy": 154.33920288085938,
      "objective/kl": 28.773828506469727,
      "objective/non_score_reward": -1.4386913776397705,
      "objective/rlhf_reward": -2.8310468539011207,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 25.329944610595703,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4755859375,
      "step": 353,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 0,
      "val/ratio": 2.0003747940063477
    },
    {
      "episode": 5680,
      "epoch": 0.03403194689098992,
      "loss/policy_avg": 0.7185342311859131,
      "lr": 9.77377300613497e-06,
      "objective/entropy": 45.80010986328125,
      "objective/kl": 35.51177215576172,
      "objective/non_score_reward": -1.7755887508392334,
      "objective/rlhf_reward": -5.586583339961704,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 69.95939636230469,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.51953125,
      "step": 354,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.996915340423584
    },
    {
      "episode": 5696,
      "epoch": 0.03412781153011947,
      "loss/policy_avg": 0.871320903301239,
      "lr": 9.773133946830267e-06,
      "objective/entropy": 136.34942626953125,
      "objective/kl": 37.25979995727539,
      "objective/non_score_reward": -1.862990140914917,
      "objective/rlhf_reward": -5.504549334721501,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 58.879180908203125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.66796875,
      "step": 355,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9990177154541016
    },
    {
      "episode": 5712,
      "epoch": 0.03422367616924902,
      "loss/policy_avg": 0.14556461572647095,
      "lr": 9.772494887525563e-06,
      "objective/entropy": -10.28516960144043,
      "objective/kl": 29.231609344482422,
      "objective/non_score_reward": -1.461580514907837,
      "objective/rlhf_reward": -4.021493013176035,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 39.2762451171875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5859375,
      "step": 356,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9985218048095703
    },
    {
      "episode": 5728,
      "epoch": 0.03431954080837857,
      "loss/policy_avg": 0.27659082412719727,
      "lr": 9.77185582822086e-06,
      "objective/entropy": -36.31108093261719,
      "objective/kl": 32.386661529541016,
      "objective/non_score_reward": -1.619333028793335,
      "objective/rlhf_reward": -6.47733199596405,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 10.265704154968262,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.71484375,
      "step": 357,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9992687702178955
    },
    {
      "episode": 5744,
      "epoch": 0.03441540544750812,
      "loss/policy_avg": 0.10546956956386566,
      "lr": 9.771216768916156e-06,
      "objective/entropy": 79.19872283935547,
      "objective/kl": 22.353626251220703,
      "objective/non_score_reward": -1.1176813840866089,
      "objective/rlhf_reward": -3.0201275154069513,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 20.73809051513672,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4873046875,
      "step": 358,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9974703788757324
    },
    {
      "episode": 5760,
      "epoch": 0.034511270086637666,
      "loss/policy_avg": 0.5648351311683655,
      "lr": 9.770577709611453e-06,
      "objective/entropy": 38.47356033325195,
      "objective/kl": 23.87390899658203,
      "objective/non_score_reward": -1.1936955451965332,
      "objective/rlhf_reward": -3.4331463485056455,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 8.14659595489502,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.53125,
      "step": 359,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0011978149414062
    },
    {
      "episode": 5776,
      "epoch": 0.034607134725767215,
      "loss/policy_avg": 0.5912380814552307,
      "lr": 9.76993865030675e-06,
      "objective/entropy": 116.97152709960938,
      "objective/kl": 40.231689453125,
      "objective/non_score_reward": -2.011584758758545,
      "objective/rlhf_reward": -6.565385702069163,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 117.33955383300781,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.666015625,
      "step": 360,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9969582557678223
    },
    {
      "episode": 5792,
      "epoch": 0.034702999364896764,
      "loss/policy_avg": -0.019477106630802155,
      "lr": 9.769299591002045e-06,
      "objective/entropy": -144.96791076660156,
      "objective/kl": 27.773448944091797,
      "objective/non_score_reward": -1.3886725902557373,
      "objective/rlhf_reward": -5.554690062999725,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 7.48216438293457,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.626953125,
      "step": 361,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000136137008667
    },
    {
      "episode": 5808,
      "epoch": 0.03479886400402631,
      "loss/policy_avg": -0.5155759453773499,
      "lr": 9.768660531697342e-06,
      "objective/entropy": 78.00074768066406,
      "objective/kl": 34.501590728759766,
      "objective/non_score_reward": -1.7250795364379883,
      "objective/rlhf_reward": -5.521715917674404,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 122.40145874023438,
      "policy/clipfrac_avg": 2.0,
      "policy/entropy_avg": 0.513671875,
      "step": 362,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.02427077293396
    },
    {
      "episode": 5824,
      "epoch": 0.03489472864315586,
      "loss/policy_avg": 0.3520805537700653,
      "lr": 9.768021472392639e-06,
      "objective/entropy": -66.29779815673828,
      "objective/kl": 23.767650604248047,
      "objective/non_score_reward": -1.188382625579834,
      "objective/rlhf_reward": -3.3535303235054013,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 66.86349487304688,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.51953125,
      "step": 363,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9973565340042114
    },
    {
      "episode": 5840,
      "epoch": 0.03499059328228541,
      "loss/policy_avg": 0.25808075070381165,
      "lr": 9.767382413087936e-06,
      "objective/entropy": 55.69321060180664,
      "objective/kl": 32.73713684082031,
      "objective/non_score_reward": -1.6368569135665894,
      "objective/rlhf_reward": -4.147427594661712,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 17.00968360900879,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.70703125,
      "step": 364,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.998908519744873
    },
    {
      "episode": 5856,
      "epoch": 0.03508645792141496,
      "loss/policy_avg": -0.33678027987480164,
      "lr": 9.766743353783233e-06,
      "objective/entropy": 63.459205627441406,
      "objective/kl": 36.74503707885742,
      "objective/non_score_reward": -1.837251901626587,
      "objective/rlhf_reward": -5.226301344410453,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 63.5507926940918,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.7734375,
      "step": 365,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0005531311035156
    },
    {
      "episode": 5872,
      "epoch": 0.03518232256054451,
      "loss/policy_avg": 0.397920161485672,
      "lr": 9.76610429447853e-06,
      "objective/entropy": -11.37314224243164,
      "objective/kl": 32.99299240112305,
      "objective/non_score_reward": -1.6496496200561523,
      "objective/rlhf_reward": -5.174766202171413,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 28.19782257080078,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.65625,
      "step": 366,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9984650611877441
    },
    {
      "episode": 5888,
      "epoch": 0.03527818719967406,
      "loss/policy_avg": 0.5101684331893921,
      "lr": 9.765465235173825e-06,
      "objective/entropy": 122.12913513183594,
      "objective/kl": 39.20099639892578,
      "objective/non_score_reward": -1.9600497484207153,
      "objective/rlhf_reward": -6.480949008201046,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 10.180255889892578,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.45703125,
      "step": 367,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9976162910461426
    },
    {
      "episode": 5904,
      "epoch": 0.03537405183880361,
      "loss/policy_avg": -0.46757811307907104,
      "lr": 9.764826175869122e-06,
      "objective/entropy": -108.47764587402344,
      "objective/kl": 25.862443923950195,
      "objective/non_score_reward": -1.2931220531463623,
      "objective/rlhf_reward": -3.6162289073138982,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 2.3750016689300537,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.703125,
      "step": 368,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0032150745391846
    },
    {
      "episode": 5920,
      "epoch": 0.035469916477933155,
      "loss/policy_avg": 0.12928390502929688,
      "lr": 9.764187116564417e-06,
      "objective/entropy": 47.25078201293945,
      "objective/kl": 23.20449447631836,
      "objective/non_score_reward": -1.1602246761322021,
      "objective/rlhf_reward": -2.240898942947388,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 2.1992838382720947,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.544921875,
      "step": 369,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0065484046936035
    },
    {
      "episode": 5936,
      "epoch": 0.035565781117062704,
      "loss/policy_avg": 0.15939241647720337,
      "lr": 9.763548057259714e-06,
      "objective/entropy": -19.609264373779297,
      "objective/kl": 28.25977325439453,
      "objective/non_score_reward": -1.4129884243011475,
      "objective/rlhf_reward": -4.273351618138653,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 59.99807357788086,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.66796875,
      "step": 370,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0036120414733887
    },
    {
      "episode": 5952,
      "epoch": 0.03566164575619225,
      "loss/policy_avg": 0.1767190843820572,
      "lr": 9.76290899795501e-06,
      "objective/entropy": -11.536600112915039,
      "objective/kl": 36.28870391845703,
      "objective/non_score_reward": -1.8144354820251465,
      "objective/rlhf_reward": -7.257741451263428,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 11.846475601196289,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.611328125,
      "step": 371,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.99981689453125
    },
    {
      "episode": 5968,
      "epoch": 0.0357575103953218,
      "loss/policy_avg": 0.3314260244369507,
      "lr": 9.762269938650308e-06,
      "objective/entropy": -30.279476165771484,
      "objective/kl": 28.756494522094727,
      "objective/non_score_reward": -1.4378247261047363,
      "objective/rlhf_reward": -4.300700943084106,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 39.342529296875,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.623046875,
      "step": 372,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.998626708984375
    },
    {
      "episode": 5984,
      "epoch": 0.03585337503445135,
      "loss/policy_avg": 0.18494009971618652,
      "lr": 9.761630879345604e-06,
      "objective/entropy": 68.65098571777344,
      "objective/kl": 36.555747985839844,
      "objective/non_score_reward": -1.8277872800827026,
      "objective/rlhf_reward": -5.486320610317301,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 10.346623420715332,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.517578125,
      "step": 373,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.000062942504883
    },
    {
      "episode": 6000,
      "epoch": 0.03594923967358091,
      "loss/policy_avg": -0.05254024267196655,
      "lr": 9.7609918200409e-06,
      "objective/entropy": -30.816913604736328,
      "objective/kl": 26.80430793762207,
      "objective/non_score_reward": -1.3402154445648193,
      "objective/rlhf_reward": -3.845090114864048,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 3.3415722846984863,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.583984375,
      "step": 374,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9991049766540527
    },
    {
      "episode": 6016,
      "epoch": 0.036045104312710456,
      "loss/policy_avg": 0.48673489689826965,
      "lr": 9.760352760736196e-06,
      "objective/entropy": -54.172760009765625,
      "objective/kl": 26.726612091064453,
      "objective/non_score_reward": -1.3363306522369385,
      "objective/rlhf_reward": -0.945322489738464,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 36.357444763183594,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.708984375,
      "step": 375,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.999312400817871
    },
    {
      "episode": 6032,
      "epoch": 0.036140968951840005,
      "loss/policy_avg": -0.06733483076095581,
      "lr": 9.759713701431493e-06,
      "objective/entropy": 135.20721435546875,
      "objective/kl": 37.13209915161133,
      "objective/non_score_reward": -1.856605052947998,
      "objective/rlhf_reward": -4.5027009590875835,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 17.97521209716797,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4873046875,
      "step": 376,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.002077341079712
    },
    {
      "episode": 6048,
      "epoch": 0.036236833590969554,
      "loss/policy_avg": -0.041654448956251144,
      "lr": 9.75907464212679e-06,
      "objective/entropy": -167.12548828125,
      "objective/kl": 25.773399353027344,
      "objective/non_score_reward": -1.2886700630187988,
      "objective/rlhf_reward": -0.7546801328659054,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 0.800922691822052,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.521484375,
      "step": 377,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.000338315963745
    },
    {
      "episode": 6064,
      "epoch": 0.0363326982300991,
      "loss/policy_avg": 0.03024141490459442,
      "lr": 9.758435582822087e-06,
      "objective/entropy": -73.82417297363281,
      "objective/kl": 26.33017349243164,
      "objective/non_score_reward": -1.3165086507797241,
      "objective/rlhf_reward": -3.14332831122068,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 23.01593780517578,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.63671875,
      "step": 378,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.002521276473999
    },
    {
      "episode": 6080,
      "epoch": 0.03642856286922865,
      "loss/policy_avg": 0.285569429397583,
      "lr": 9.757796523517384e-06,
      "objective/entropy": -111.42575073242188,
      "objective/kl": 28.885374069213867,
      "objective/non_score_reward": -1.4442687034606934,
      "objective/rlhf_reward": -4.398472824183804,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 79.57511901855469,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.61328125,
      "step": 379,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9979077577590942
    },
    {
      "episode": 6096,
      "epoch": 0.0365244275083582,
      "loss/policy_avg": -0.022392742335796356,
      "lr": 9.75715746421268e-06,
      "objective/entropy": -79.86695098876953,
      "objective/kl": 17.694236755371094,
      "objective/non_score_reward": -0.8847118616104126,
      "objective/rlhf_reward": -1.7140187576142063,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 4.339657306671143,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.767578125,
      "step": 380,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0013508796691895
    },
    {
      "episode": 6112,
      "epoch": 0.03662029214748775,
      "loss/policy_avg": 0.4459357261657715,
      "lr": 9.756518404907976e-06,
      "objective/entropy": -148.62872314453125,
      "objective/kl": 21.098934173583984,
      "objective/non_score_reward": -1.054946780204773,
      "objective/rlhf_reward": 0.1802129983901981,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 6.359186172485352,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.576171875,
      "step": 381,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9992458820343018
    },
    {
      "episode": 6128,
      "epoch": 0.0367161567866173,
      "loss/policy_avg": -0.012147974222898483,
      "lr": 9.755879345603273e-06,
      "objective/entropy": 152.35232543945312,
      "objective/kl": 31.486684799194336,
      "objective/non_score_reward": -1.5743342638015747,
      "objective/rlhf_reward": -3.3736180409204692,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 35.51153564453125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5859375,
      "step": 382,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.999483585357666
    },
    {
      "episode": 6144,
      "epoch": 0.03681202142574685,
      "loss/policy_avg": 0.012859173119068146,
      "lr": 9.75524028629857e-06,
      "objective/entropy": 26.343887329101562,
      "objective/kl": 33.34328079223633,
      "objective/non_score_reward": -1.6671642065048218,
      "objective/rlhf_reward": -4.721245358662541,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 22.614994049072266,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.82421875,
      "step": 383,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0018911361694336
    },
    {
      "episode": 6160,
      "epoch": 0.0369078860648764,
      "loss/policy_avg": 0.21653258800506592,
      "lr": 9.754601226993867e-06,
      "objective/entropy": 109.49678039550781,
      "objective/kl": 43.73469543457031,
      "objective/non_score_reward": -2.186734676361084,
      "objective/rlhf_reward": -7.296340326876983,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 31.000137329101562,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.689453125,
      "step": 384,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 2.001706123352051
    },
    {
      "episode": 6176,
      "epoch": 0.037003750704005946,
      "loss/policy_avg": 0.17637991905212402,
      "lr": 9.753962167689162e-06,
      "objective/entropy": -57.256038665771484,
      "objective/kl": 20.548786163330078,
      "objective/non_score_reward": -1.0274393558502197,
      "objective/rlhf_reward": -1.9870514891305304,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 7.945226669311523,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.71484375,
      "step": 385,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000217914581299
    },
    {
      "episode": 6192,
      "epoch": 0.037099615343135495,
      "loss/policy_avg": 0.23474755883216858,
      "lr": 9.753323108384459e-06,
      "objective/entropy": -67.67970275878906,
      "objective/kl": 29.886417388916016,
      "objective/non_score_reward": -1.4943209886550903,
      "objective/rlhf_reward": -4.461511933597263,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 30.2872314453125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.517578125,
      "step": 386,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9968770742416382
    },
    {
      "episode": 6208,
      "epoch": 0.037195479982265044,
      "loss/policy_avg": 3.0326309204101562,
      "lr": 9.752684049079756e-06,
      "objective/entropy": -30.304298400878906,
      "objective/kl": 34.21199035644531,
      "objective/non_score_reward": -1.710599660873413,
      "objective/rlhf_reward": -5.391800324530944,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 191.66567993164062,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62109375,
      "step": 387,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.998448133468628
    },
    {
      "episode": 6224,
      "epoch": 0.03729134462139459,
      "loss/policy_avg": 0.020012550055980682,
      "lr": 9.752044989775053e-06,
      "objective/entropy": -44.4876594543457,
      "objective/kl": 30.23657989501953,
      "objective/non_score_reward": -1.5118290185928345,
      "objective/rlhf_reward": -4.099904905037816,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 21.57486915588379,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.767578125,
      "step": 388,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.002869129180908
    },
    {
      "episode": 6240,
      "epoch": 0.03738720926052414,
      "loss/policy_avg": 0.33562996983528137,
      "lr": 9.751405930470348e-06,
      "objective/entropy": -154.47891235351562,
      "objective/kl": 18.6168155670166,
      "objective/non_score_reward": -0.9308407306671143,
      "objective/rlhf_reward": -3.723362982273102,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 13.14146614074707,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.65625,
      "step": 389,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0002055168151855
    },
    {
      "episode": 6256,
      "epoch": 0.03748307389965369,
      "loss/policy_avg": 0.037651438266038895,
      "lr": 9.750766871165645e-06,
      "objective/entropy": -6.050981521606445,
      "objective/kl": 26.29869270324707,
      "objective/non_score_reward": -1.3149347305297852,
      "objective/rlhf_reward": -5.25973904132843,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 27.001697540283203,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.759765625,
      "step": 390,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9982492923736572
    },
    {
      "episode": 6272,
      "epoch": 0.03757893853878324,
      "loss/policy_avg": 0.1277342140674591,
      "lr": 9.750127811860941e-06,
      "objective/entropy": -114.59310913085938,
      "objective/kl": 33.31782531738281,
      "objective/non_score_reward": -1.6658912897109985,
      "objective/rlhf_reward": -3.739846025348875,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 30.69461441040039,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.755859375,
      "step": 391,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9998853206634521
    },
    {
      "episode": 6288,
      "epoch": 0.03767480317791279,
      "loss/policy_avg": 0.08161749690771103,
      "lr": 9.749488752556238e-06,
      "objective/entropy": 28.02770233154297,
      "objective/kl": 25.580188751220703,
      "objective/non_score_reward": -1.279009461402893,
      "objective/rlhf_reward": -3.6654397054627985,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 11.5637845993042,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.3642578125,
      "step": 392,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9969701766967773
    },
    {
      "episode": 6304,
      "epoch": 0.03777066781704234,
      "loss/policy_avg": 0.013617899268865585,
      "lr": 9.748849693251534e-06,
      "objective/entropy": 137.66958618164062,
      "objective/kl": 36.88829040527344,
      "objective/non_score_reward": -1.8444143533706665,
      "objective/rlhf_reward": -5.999055602637631,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 3.8839926719665527,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.703125,
      "step": 393,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9998722076416016
    },
    {
      "episode": 6320,
      "epoch": 0.037866532456171886,
      "loss/policy_avg": 0.7664667963981628,
      "lr": 9.74821063394683e-06,
      "objective/entropy": 12.1875,
      "objective/kl": 27.703767776489258,
      "objective/non_score_reward": -1.385188341140747,
      "objective/rlhf_reward": -4.181503379081173,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 29.00311279296875,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.64453125,
      "step": 394,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9983487129211426
    },
    {
      "episode": 6336,
      "epoch": 0.037962397095301435,
      "loss/policy_avg": 0.13891640305519104,
      "lr": 9.747571574642127e-06,
      "objective/entropy": -52.291236877441406,
      "objective/kl": 29.62856101989746,
      "objective/non_score_reward": -1.4814281463623047,
      "objective/rlhf_reward": -4.10088383701713,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 45.48643112182617,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.763671875,
      "step": 395,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9967341423034668
    },
    {
      "episode": 6352,
      "epoch": 0.038058261734430984,
      "loss/policy_avg": -0.5259265899658203,
      "lr": 9.746932515337424e-06,
      "objective/entropy": -14.848602294921875,
      "objective/kl": 36.51825714111328,
      "objective/non_score_reward": -1.8259128332138062,
      "objective/rlhf_reward": -5.180944981352363,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 133.36766052246094,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.673828125,
      "step": 396,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.008728504180908
    },
    {
      "episode": 6368,
      "epoch": 0.03815412637356053,
      "loss/policy_avg": 0.1340530812740326,
      "lr": 9.746293456032721e-06,
      "objective/entropy": -13.48861312866211,
      "objective/kl": 24.147233963012695,
      "objective/non_score_reward": -1.2073616981506348,
      "objective/rlhf_reward": -3.0961134592692057,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 4.865433216094971,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.787109375,
      "step": 397,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0002119541168213
    },
    {
      "episode": 6384,
      "epoch": 0.03824999101269008,
      "loss/policy_avg": 0.036313191056251526,
      "lr": 9.745654396728016e-06,
      "objective/entropy": -118.45596313476562,
      "objective/kl": 26.90463638305664,
      "objective/non_score_reward": -1.3452317714691162,
      "objective/rlhf_reward": -3.5560982182350864,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 7.153594017028809,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.708984375,
      "step": 398,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0009984970092773
    },
    {
      "episode": 6400,
      "epoch": 0.03834585565181963,
      "loss/policy_avg": 0.07543957978487015,
      "lr": 9.745015337423313e-06,
      "objective/entropy": 5.307586669921875,
      "objective/kl": 29.030933380126953,
      "objective/non_score_reward": -1.4515466690063477,
      "objective/rlhf_reward": -2.88246778094885,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 14.018705368041992,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.611328125,
      "step": 399,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9984686374664307
    },
    {
      "episode": 6416,
      "epoch": 0.03844172029094918,
      "loss/policy_avg": 0.11864852905273438,
      "lr": 9.74437627811861e-06,
      "objective/entropy": 10.484695434570312,
      "objective/kl": 24.462554931640625,
      "objective/non_score_reward": -1.2231277227401733,
      "objective/rlhf_reward": -3.376739227565464,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 14.423017501831055,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.685546875,
      "step": 400,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9988946914672852
    },
    {
      "episode": 6432,
      "epoch": 0.03853758493007873,
      "loss/policy_avg": -0.036792345345020294,
      "lr": 9.743737218813907e-06,
      "objective/entropy": -181.87400817871094,
      "objective/kl": 23.07555389404297,
      "objective/non_score_reward": -1.153777837753296,
      "objective/rlhf_reward": -3.191279132564632,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 20.132736206054688,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.708984375,
      "step": 401,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.00081729888916
    },
    {
      "episode": 6448,
      "epoch": 0.03863344956920828,
      "loss/policy_avg": 0.22927281260490417,
      "lr": 9.743098159509204e-06,
      "objective/entropy": -88.96450805664062,
      "objective/kl": 32.569129943847656,
      "objective/non_score_reward": -1.6284565925598145,
      "objective/rlhf_reward": -4.780492917696634,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 49.499900817871094,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.70703125,
      "step": 402,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9982388019561768
    },
    {
      "episode": 6464,
      "epoch": 0.03872931420833783,
      "loss/policy_avg": 0.30984753370285034,
      "lr": 9.7424591002045e-06,
      "objective/entropy": -18.365474700927734,
      "objective/kl": 31.77776336669922,
      "objective/non_score_reward": -1.5888882875442505,
      "objective/rlhf_reward": -5.030040267735643,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 36.973690032958984,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.607421875,
      "step": 403,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9970709085464478
    },
    {
      "episode": 6480,
      "epoch": 0.038825178847467376,
      "loss/policy_avg": 0.06557717174291611,
      "lr": 9.741820040899796e-06,
      "objective/entropy": -141.13568115234375,
      "objective/kl": 28.107177734375,
      "objective/non_score_reward": -1.405358910560608,
      "objective/rlhf_reward": -3.674024294094975,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 31.192813873291016,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.791015625,
      "step": 404,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9952688217163086
    },
    {
      "episode": 6496,
      "epoch": 0.038921043486596925,
      "loss/policy_avg": 0.05502002686262131,
      "lr": 9.741180981595093e-06,
      "objective/entropy": 32.80726623535156,
      "objective/kl": 44.297119140625,
      "objective/non_score_reward": -2.2148561477661133,
      "objective/rlhf_reward": -7.5001741287454795,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 19.57358169555664,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.650390625,
      "step": 405,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9999144077301025
    },
    {
      "episode": 6512,
      "epoch": 0.039016908125726474,
      "loss/policy_avg": 0.026680059731006622,
      "lr": 9.74054192229039e-06,
      "objective/entropy": 119.29817962646484,
      "objective/kl": 39.39287567138672,
      "objective/non_score_reward": -1.9696437120437622,
      "objective/rlhf_reward": -6.536938837080627,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 0.6370775699615479,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6171875,
      "step": 406,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0018253326416016
    },
    {
      "episode": 6528,
      "epoch": 0.03911277276485602,
      "loss/policy_avg": 0.6271831393241882,
      "lr": 9.739902862985686e-06,
      "objective/entropy": 6.752727508544922,
      "objective/kl": 25.43050765991211,
      "objective/non_score_reward": -1.2715253829956055,
      "objective/rlhf_reward": -5.086101770401001,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 17.81015396118164,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.74609375,
      "step": 407,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9977548122406006
    },
    {
      "episode": 6544,
      "epoch": 0.03920863740398557,
      "loss/policy_avg": 0.30096232891082764,
      "lr": 9.739263803680983e-06,
      "objective/entropy": -24.516462326049805,
      "objective/kl": 38.53913116455078,
      "objective/non_score_reward": -1.9269566535949707,
      "objective/rlhf_reward": -5.585120143667732,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 15.016406059265137,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.603515625,
      "step": 408,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.994694709777832
    },
    {
      "episode": 6560,
      "epoch": 0.03930450204311512,
      "loss/policy_avg": 0.03762083500623703,
      "lr": 9.73862474437628e-06,
      "objective/entropy": -218.5489501953125,
      "objective/kl": 26.699615478515625,
      "objective/non_score_reward": -1.3349807262420654,
      "objective/rlhf_reward": -3.6780635170346363,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 59.4561653137207,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.54296875,
      "step": 409,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9982738494873047
    },
    {
      "episode": 6576,
      "epoch": 0.03940036668224467,
      "loss/policy_avg": 0.2932765483856201,
      "lr": 9.737985685071575e-06,
      "objective/entropy": -25.477672576904297,
      "objective/kl": 35.529788970947266,
      "objective/non_score_reward": -1.776489496231079,
      "objective/rlhf_reward": -5.372624413172403,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 39.98287582397461,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.55859375,
      "step": 410,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.999699354171753
    },
    {
      "episode": 6592,
      "epoch": 0.03949623132137422,
      "loss/policy_avg": -0.2486688196659088,
      "lr": 9.737346625766872e-06,
      "objective/entropy": -12.952373504638672,
      "objective/kl": 33.62919616699219,
      "objective/non_score_reward": -1.681459903717041,
      "objective/rlhf_reward": -4.778428207116063,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 28.626731872558594,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.56640625,
      "step": 411,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.003122568130493
    },
    {
      "episode": 6608,
      "epoch": 0.03959209596050377,
      "loss/policy_avg": 0.3249208629131317,
      "lr": 9.736707566462167e-06,
      "objective/entropy": -52.927459716796875,
      "objective/kl": 33.82263946533203,
      "objective/non_score_reward": -1.6911320686340332,
      "objective/rlhf_reward": -4.364528393745422,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 41.674591064453125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.791015625,
      "step": 412,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.000582695007324
    },
    {
      "episode": 6624,
      "epoch": 0.039687960599633317,
      "loss/policy_avg": 0.15019002556800842,
      "lr": 9.736068507157464e-06,
      "objective/entropy": -22.71458625793457,
      "objective/kl": 32.99541473388672,
      "objective/non_score_reward": -1.6497704982757568,
      "objective/rlhf_reward": -5.257446458845763,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 6.256417274475098,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6484375,
      "step": 413,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9978971481323242
    },
    {
      "episode": 6640,
      "epoch": 0.039783825238762865,
      "loss/policy_avg": 0.296099990606308,
      "lr": 9.735429447852761e-06,
      "objective/entropy": -10.485054016113281,
      "objective/kl": 28.53786277770996,
      "objective/non_score_reward": -1.4268931150436401,
      "objective/rlhf_reward": -3.9742393652598063,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 4.458545684814453,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.703125,
      "step": 414,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9996311664581299
    },
    {
      "episode": 6656,
      "epoch": 0.039879689877892414,
      "loss/policy_avg": 0.3615366816520691,
      "lr": 9.734790388548058e-06,
      "objective/entropy": -102.9046859741211,
      "objective/kl": 19.901390075683594,
      "objective/non_score_reward": -0.9950695037841797,
      "objective/rlhf_reward": -2.3184185675984486,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 18.427024841308594,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.82421875,
      "step": 415,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999051809310913
    },
    {
      "episode": 6672,
      "epoch": 0.03997555451702196,
      "loss/policy_avg": 0.14772659540176392,
      "lr": 9.734151329243355e-06,
      "objective/entropy": -148.49395751953125,
      "objective/kl": 26.190744400024414,
      "objective/non_score_reward": -1.3095372915267944,
      "objective/rlhf_reward": -3.6340291834512524,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 59.936073303222656,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.806640625,
      "step": 416,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001544237136841
    },
    {
      "episode": 6688,
      "epoch": 0.04007141915615151,
      "loss/policy_avg": 0.23557257652282715,
      "lr": 9.73351226993865e-06,
      "objective/entropy": -145.32284545898438,
      "objective/kl": 30.992046356201172,
      "objective/non_score_reward": -1.5496025085449219,
      "objective/rlhf_reward": -3.7984096765518185,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 7.065143585205078,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.904296875,
      "step": 417,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9989118576049805
    },
    {
      "episode": 6704,
      "epoch": 0.04016728379528106,
      "loss/policy_avg": 0.12179827690124512,
      "lr": 9.732873210633947e-06,
      "objective/entropy": -64.65836334228516,
      "objective/kl": 35.22796630859375,
      "objective/non_score_reward": -1.7613983154296875,
      "objective/rlhf_reward": -5.686343335841579,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 61.170570373535156,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5546875,
      "step": 418,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9985511302947998
    },
    {
      "episode": 6720,
      "epoch": 0.04026314843441061,
      "loss/policy_avg": -0.043803490698337555,
      "lr": 9.732234151329244e-06,
      "objective/entropy": -87.70707702636719,
      "objective/kl": 28.95832061767578,
      "objective/non_score_reward": -1.447916030883789,
      "objective/rlhf_reward": -4.275892340930637,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 2.8885676860809326,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.763671875,
      "step": 419,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9999752044677734
    },
    {
      "episode": 6736,
      "epoch": 0.04035901307354016,
      "loss/policy_avg": 0.18042519688606262,
      "lr": 9.73159509202454e-06,
      "objective/entropy": -4.936176300048828,
      "objective/kl": 30.613567352294922,
      "objective/non_score_reward": -1.5306785106658936,
      "objective/rlhf_reward": -4.722713804244995,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 209.10888671875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.619140625,
      "step": 420,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9993661642074585
    },
    {
      "episode": 6752,
      "epoch": 0.04045487771266971,
      "loss/policy_avg": 0.6567588448524475,
      "lr": 9.730956032719838e-06,
      "objective/entropy": -162.10116577148438,
      "objective/kl": 33.140079498291016,
      "objective/non_score_reward": -1.6570039987564087,
      "objective/rlhf_reward": -4.505309881941352,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 33.703067779541016,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7578125,
      "step": 421,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9999632835388184
    },
    {
      "episode": 6768,
      "epoch": 0.04055074235179926,
      "loss/policy_avg": 0.5961964130401611,
      "lr": 9.730316973415135e-06,
      "objective/entropy": 18.374740600585938,
      "objective/kl": 36.82442092895508,
      "objective/non_score_reward": -1.8412209749221802,
      "objective/rlhf_reward": -4.441164646984312,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 62.1960334777832,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.654296875,
      "step": 422,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9999306201934814
    },
    {
      "episode": 6784,
      "epoch": 0.040646606990928806,
      "loss/policy_avg": 0.19755011796951294,
      "lr": 9.72967791411043e-06,
      "objective/entropy": -57.290000915527344,
      "objective/kl": 30.764808654785156,
      "objective/non_score_reward": -1.5382404327392578,
      "objective/rlhf_reward": -4.811326077490478,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 37.60175323486328,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.67578125,
      "step": 423,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9990873336791992
    },
    {
      "episode": 6800,
      "epoch": 0.040742471630058355,
      "loss/policy_avg": 0.2760317325592041,
      "lr": 9.729038854805727e-06,
      "objective/entropy": -54.2406005859375,
      "objective/kl": 28.681961059570312,
      "objective/non_score_reward": -1.4340981245040894,
      "objective/rlhf_reward": -3.7889812094735458,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 13.514376640319824,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.548828125,
      "step": 424,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0004029273986816
    },
    {
      "episode": 6816,
      "epoch": 0.040838336269187904,
      "loss/policy_avg": 0.05885821580886841,
      "lr": 9.728399795501023e-06,
      "objective/entropy": -30.280364990234375,
      "objective/kl": 31.102825164794922,
      "objective/non_score_reward": -1.5551413297653198,
      "objective/rlhf_reward": -4.820565319061279,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 61.290470123291016,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.767578125,
      "step": 425,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9986295700073242
    },
    {
      "episode": 6832,
      "epoch": 0.04093420090831745,
      "loss/policy_avg": 0.044344570487737656,
      "lr": 9.72776073619632e-06,
      "objective/entropy": -223.16510009765625,
      "objective/kl": 11.546382904052734,
      "objective/non_score_reward": -0.5773191452026367,
      "objective/rlhf_reward": -0.3618654114770252,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 2.5684561729431152,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7109375,
      "step": 426,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 2.0002474784851074
    },
    {
      "episode": 6848,
      "epoch": 0.041030065547447,
      "loss/policy_avg": 0.11938305199146271,
      "lr": 9.727121676891617e-06,
      "objective/entropy": -84.6756362915039,
      "objective/kl": 32.253173828125,
      "objective/non_score_reward": -1.6126585006713867,
      "objective/rlhf_reward": -5.000035624118194,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 54.86524963378906,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.693359375,
      "step": 427,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9985135793685913
    },
    {
      "episode": 6864,
      "epoch": 0.04112593018657655,
      "loss/policy_avg": -0.02704887092113495,
      "lr": 9.726482617586912e-06,
      "objective/entropy": 61.31664276123047,
      "objective/kl": 50.535186767578125,
      "objective/non_score_reward": -2.526759624481201,
      "objective/rlhf_reward": -8.765402606039672,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 87.70621490478516,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4794921875,
      "step": 428,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0009117126464844
    },
    {
      "episode": 6880,
      "epoch": 0.0412217948257061,
      "loss/policy_avg": 0.3563253581523895,
      "lr": 9.72584355828221e-06,
      "objective/entropy": -201.59555053710938,
      "objective/kl": 26.542133331298828,
      "objective/non_score_reward": -1.3271067142486572,
      "objective/rlhf_reward": -2.384707783104154,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 12.606565475463867,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.60546875,
      "step": 429,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9991586208343506
    },
    {
      "episode": 6896,
      "epoch": 0.04131765946483565,
      "loss/policy_avg": 0.3849369287490845,
      "lr": 9.725204498977506e-06,
      "objective/entropy": -172.11151123046875,
      "objective/kl": 31.27842140197754,
      "objective/non_score_reward": -1.5639209747314453,
      "objective/rlhf_reward": -4.52235098282496,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 35.41864776611328,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.794921875,
      "step": 430,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9977457523345947
    },
    {
      "episode": 6912,
      "epoch": 0.0414135241039652,
      "loss/policy_avg": 0.5410929918289185,
      "lr": 9.724565439672803e-06,
      "objective/entropy": -53.43696594238281,
      "objective/kl": 36.75939178466797,
      "objective/non_score_reward": -1.8379695415496826,
      "objective/rlhf_reward": -5.229172053114448,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 11.017414093017578,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.51953125,
      "step": 431,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9961919784545898
    },
    {
      "episode": 6928,
      "epoch": 0.041509388743094754,
      "loss/policy_avg": 0.5185568332672119,
      "lr": 9.7239263803681e-06,
      "objective/entropy": -42.49586486816406,
      "objective/kl": 31.465147018432617,
      "objective/non_score_reward": -1.5732574462890625,
      "objective/rlhf_reward": -4.914427437869412,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 1.669852614402771,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.62109375,
      "step": 432,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9998645782470703
    },
    {
      "episode": 6944,
      "epoch": 0.0416052533822243,
      "loss/policy_avg": -0.09886922687292099,
      "lr": 9.723287321063397e-06,
      "objective/entropy": -182.28286743164062,
      "objective/kl": 27.1431884765625,
      "objective/non_score_reward": -1.3571594953536987,
      "objective/rlhf_reward": -3.6953046480814615,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 25.096237182617188,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.70703125,
      "step": 433,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0018882751464844
    },
    {
      "episode": 6960,
      "epoch": 0.04170111802135385,
      "loss/policy_avg": 0.39349502325057983,
      "lr": 9.722648261758692e-06,
      "objective/entropy": 28.20358657836914,
      "objective/kl": 38.92597198486328,
      "objective/non_score_reward": -1.946298599243164,
      "objective/rlhf_reward": -6.385194158554077,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 46.153385162353516,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4912109375,
      "step": 434,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9992406368255615
    },
    {
      "episode": 6976,
      "epoch": 0.0417969826604834,
      "loss/policy_avg": 0.3586619198322296,
      "lr": 9.722009202453989e-06,
      "objective/entropy": -126.02680206298828,
      "objective/kl": 32.40974807739258,
      "objective/non_score_reward": -1.6204874515533447,
      "objective/rlhf_reward": -4.534538338856633,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 10.944326400756836,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.666015625,
      "step": 435,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9971150159835815
    },
    {
      "episode": 6992,
      "epoch": 0.04189284729961295,
      "loss/policy_avg": -0.4687817692756653,
      "lr": 9.721370143149284e-06,
      "objective/entropy": -69.42359924316406,
      "objective/kl": 20.10685157775879,
      "objective/non_score_reward": -1.0053426027297974,
      "objective/rlhf_reward": -2.6427683430291236,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 22.483867645263672,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.6328125,
      "step": 436,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.041172504425049
    },
    {
      "episode": 7008,
      "epoch": 0.0419887119387425,
      "loss/policy_avg": 0.0906272605061531,
      "lr": 9.720731083844581e-06,
      "objective/entropy": -149.47274780273438,
      "objective/kl": 26.28115463256836,
      "objective/non_score_reward": -1.3140578269958496,
      "objective/rlhf_reward": -3.1335249564805365,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 2.7223973274230957,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.599609375,
      "step": 437,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.000173568725586
    },
    {
      "episode": 7024,
      "epoch": 0.04208457657787205,
      "loss/policy_avg": 0.3348531126976013,
      "lr": 9.720092024539878e-06,
      "objective/entropy": 22.56686782836914,
      "objective/kl": 36.523582458496094,
      "objective/non_score_reward": -1.8261791467666626,
      "objective/rlhf_reward": -5.700596723620015,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 20.443164825439453,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.515625,
      "step": 438,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9979515075683594
    },
    {
      "episode": 7040,
      "epoch": 0.042180441217001596,
      "loss/policy_avg": 0.04725319519639015,
      "lr": 9.719452965235175e-06,
      "objective/entropy": -71.08361053466797,
      "objective/kl": 20.915573120117188,
      "objective/non_score_reward": -1.045778751373291,
      "objective/rlhf_reward": -2.0604087731995917,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 7.088305473327637,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4462890625,
      "step": 439,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0024590492248535
    },
    {
      "episode": 7056,
      "epoch": 0.042276305856131145,
      "loss/policy_avg": 0.18381188809871674,
      "lr": 9.718813905930472e-06,
      "objective/entropy": 25.569873809814453,
      "objective/kl": 38.07762145996094,
      "objective/non_score_reward": -1.9038809537887573,
      "objective/rlhf_reward": -3.215523815155029,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 30.962854385375977,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.62890625,
      "step": 440,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 2.0011448860168457
    },
    {
      "episode": 7072,
      "epoch": 0.042372170495260694,
      "loss/policy_avg": 0.1967303454875946,
      "lr": 9.718174846625767e-06,
      "objective/entropy": -103.38803100585938,
      "objective/kl": 29.222076416015625,
      "objective/non_score_reward": -1.4611037969589233,
      "objective/rlhf_reward": -4.240295205179768,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 8.899417877197266,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.689453125,
      "step": 441,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9986486434936523
    },
    {
      "episode": 7088,
      "epoch": 0.04246803513439024,
      "loss/policy_avg": -0.07635466754436493,
      "lr": 9.717535787321064e-06,
      "objective/entropy": -54.58887481689453,
      "objective/kl": 35.043663024902344,
      "objective/non_score_reward": -1.752183198928833,
      "objective/rlhf_reward": -5.527779820378184,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 12.18149185180664,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.580078125,
      "step": 442,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0013041496276855
    },
    {
      "episode": 7104,
      "epoch": 0.04256389977351979,
      "loss/policy_avg": 0.3104819059371948,
      "lr": 9.71689672801636e-06,
      "objective/entropy": -53.842830657958984,
      "objective/kl": 23.18008804321289,
      "objective/non_score_reward": -1.1590044498443604,
      "objective/rlhf_reward": -3.0797587921291143,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 86.82899475097656,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.91796875,
      "step": 443,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9991846084594727
    },
    {
      "episode": 7120,
      "epoch": 0.04265976441264934,
      "loss/policy_avg": 0.6317604780197144,
      "lr": 9.716257668711657e-06,
      "objective/entropy": -21.19356918334961,
      "objective/kl": 30.069751739501953,
      "objective/non_score_reward": -1.503487467765808,
      "objective/rlhf_reward": -4.6353477025903285,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 128.40951538085938,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.76953125,
      "step": 444,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.997659683227539
    },
    {
      "episode": 7136,
      "epoch": 0.04275562905177889,
      "loss/policy_avg": 0.33194229006767273,
      "lr": 9.715618609406954e-06,
      "objective/entropy": -102.48907470703125,
      "objective/kl": 32.374549865722656,
      "objective/non_score_reward": -1.6187275648117065,
      "objective/rlhf_reward": -6.474910318851471,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 7.681756973266602,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.59375,
      "step": 445,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.998705506324768
    },
    {
      "episode": 7152,
      "epoch": 0.04285149369090844,
      "loss/policy_avg": 0.26850253343582153,
      "lr": 9.714979550102251e-06,
      "objective/entropy": 69.35136413574219,
      "objective/kl": 26.097612380981445,
      "objective/non_score_reward": -1.3048806190490723,
      "objective/rlhf_reward": -3.738569977696299,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 62.56462097167969,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6640625,
      "step": 446,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.99786376953125
    },
    {
      "episode": 7168,
      "epoch": 0.04294735833003799,
      "loss/policy_avg": -0.1885017603635788,
      "lr": 9.714340490797546e-06,
      "objective/entropy": -16.98421859741211,
      "objective/kl": 30.90627670288086,
      "objective/non_score_reward": -1.5453139543533325,
      "objective/rlhf_reward": -4.577135715548115,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 11.766645431518555,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.73828125,
      "step": 447,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.003262996673584
    },
    {
      "episode": 7184,
      "epoch": 0.04304322296916754,
      "loss/policy_avg": 0.24147900938987732,
      "lr": 9.713701431492843e-06,
      "objective/entropy": -196.87869262695312,
      "objective/kl": 23.231670379638672,
      "objective/non_score_reward": -1.161583662033081,
      "objective/rlhf_reward": -3.1305624780976142,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 19.03369903564453,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.69140625,
      "step": 448,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9996755123138428
    },
    {
      "episode": 7200,
      "epoch": 0.043139087608297086,
      "loss/policy_avg": 0.3051467537879944,
      "lr": 9.71306237218814e-06,
      "objective/entropy": -54.2137565612793,
      "objective/kl": 33.54918670654297,
      "objective/non_score_reward": -1.6774592399597168,
      "objective/rlhf_reward": -5.047977810323822,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 74.37176513671875,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.783203125,
      "step": 449,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9967325925827026
    },
    {
      "episode": 7216,
      "epoch": 0.043234952247426635,
      "loss/policy_avg": 0.0008301436901092529,
      "lr": 9.712423312883437e-06,
      "objective/entropy": -37.864322662353516,
      "objective/kl": 24.052818298339844,
      "objective/non_score_reward": -1.2026410102844238,
      "objective/rlhf_reward": -2.9857349946823826,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 1.6498993635177612,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.525390625,
      "step": 450,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.001569986343384
    },
    {
      "episode": 7232,
      "epoch": 0.043330816886556184,
      "loss/policy_avg": 0.10217726975679398,
      "lr": 9.711784253578734e-06,
      "objective/entropy": -97.12496948242188,
      "objective/kl": 20.143707275390625,
      "objective/non_score_reward": -1.007185459136963,
      "objective/rlhf_reward": -2.669491672252102,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 37.34214401245117,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.783203125,
      "step": 451,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9993760585784912
    },
    {
      "episode": 7248,
      "epoch": 0.04342668152568573,
      "loss/policy_avg": 0.2181258350610733,
      "lr": 9.711145194274029e-06,
      "objective/entropy": -187.07266235351562,
      "objective/kl": 22.520824432373047,
      "objective/non_score_reward": -1.1260414123535156,
      "objective/rlhf_reward": -2.9000454283395585,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 80.40426635742188,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.654296875,
      "step": 452,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000328540802002
    },
    {
      "episode": 7264,
      "epoch": 0.04352254616481528,
      "loss/policy_avg": 0.28700706362724304,
      "lr": 9.710506134969326e-06,
      "objective/entropy": -119.91871643066406,
      "objective/kl": 30.88311004638672,
      "objective/non_score_reward": -1.5441553592681885,
      "objective/rlhf_reward": -4.834986022024779,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 14.897968292236328,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.787109375,
      "step": 453,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9972997903823853
    },
    {
      "episode": 7280,
      "epoch": 0.04361841080394483,
      "loss/policy_avg": 0.013649387285113335,
      "lr": 9.709867075664623e-06,
      "objective/entropy": -137.84861755371094,
      "objective/kl": 35.624549865722656,
      "objective/non_score_reward": -1.781227707862854,
      "objective/rlhf_reward": -7.124910950660706,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 77.14759826660156,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.712890625,
      "step": 454,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999969720840454
    },
    {
      "episode": 7296,
      "epoch": 0.04371427544307438,
      "loss/policy_avg": 0.9055305123329163,
      "lr": 9.70922801635992e-06,
      "objective/entropy": -177.1896514892578,
      "objective/kl": 34.19129943847656,
      "objective/non_score_reward": -1.7095649242401123,
      "objective/rlhf_reward": -5.387661199183807,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 51.92662811279297,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.66796875,
      "step": 455,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9976481199264526
    },
    {
      "episode": 7312,
      "epoch": 0.04381014008220393,
      "loss/policy_avg": -0.14486947655677795,
      "lr": 9.708588957055215e-06,
      "objective/entropy": -91.43609619140625,
      "objective/kl": 30.12580108642578,
      "objective/non_score_reward": -1.5062901973724365,
      "objective/rlhf_reward": -4.509388887675938,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 24.85628890991211,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.611328125,
      "step": 456,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.011446952819824
    },
    {
      "episode": 7328,
      "epoch": 0.04390600472133348,
      "loss/policy_avg": 0.3115137815475464,
      "lr": 9.707949897750512e-06,
      "objective/entropy": -33.496673583984375,
      "objective/kl": 24.4674072265625,
      "objective/non_score_reward": -1.2233703136444092,
      "objective/rlhf_reward": -3.377709650787052,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 9.057685852050781,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.751953125,
      "step": 457,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0009520053863525
    },
    {
      "episode": 7344,
      "epoch": 0.044001869360463026,
      "loss/policy_avg": 1.4892723560333252,
      "lr": 9.707310838445809e-06,
      "objective/entropy": -35.618934631347656,
      "objective/kl": 27.64456558227539,
      "objective/non_score_reward": -1.3822282552719116,
      "objective/rlhf_reward": -3.5815017921494796,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 1.899414300918579,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.57421875,
      "step": 458,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9999825954437256
    },
    {
      "episode": 7360,
      "epoch": 0.044097733999592575,
      "loss/policy_avg": 0.022264737635850906,
      "lr": 9.706671779141105e-06,
      "objective/entropy": 31.060089111328125,
      "objective/kl": 34.85979461669922,
      "objective/non_score_reward": -1.7429897785186768,
      "objective/rlhf_reward": -5.367839369837361,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 7.1077799797058105,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.751953125,
      "step": 459,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9993565082550049
    },
    {
      "episode": 7376,
      "epoch": 0.044193598638722124,
      "loss/policy_avg": 0.08219340443611145,
      "lr": 9.7060327198364e-06,
      "objective/entropy": -69.6414566040039,
      "objective/kl": 35.42669677734375,
      "objective/non_score_reward": -1.7713346481323242,
      "objective/rlhf_reward": -5.726088785861416,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 21.27887535095215,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.712890625,
      "step": 460,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0088043212890625
    },
    {
      "episode": 7392,
      "epoch": 0.04428946327785167,
      "loss/policy_avg": 0.03685396909713745,
      "lr": 9.705393660531698e-06,
      "objective/entropy": -245.04380798339844,
      "objective/kl": 21.42380142211914,
      "objective/non_score_reward": -1.0711899995803833,
      "objective/rlhf_reward": -2.1620538852372504,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 1.849046230316162,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.767578125,
      "step": 461,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.008730411529541
    },
    {
      "episode": 7408,
      "epoch": 0.04438532791698122,
      "loss/policy_avg": 0.5492111444473267,
      "lr": 9.704754601226994e-06,
      "objective/entropy": 9.25466537475586,
      "objective/kl": 20.997167587280273,
      "objective/non_score_reward": -1.0498583316802979,
      "objective/rlhf_reward": -1.2757146700632302,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 36.03380584716797,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.9296875,
      "step": 462,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.000826120376587
    },
    {
      "episode": 7424,
      "epoch": 0.04448119255611077,
      "loss/policy_avg": 0.22961178421974182,
      "lr": 9.704115541922291e-06,
      "objective/entropy": -2.9236984252929688,
      "objective/kl": 26.89717674255371,
      "objective/non_score_reward": -1.3448588848114014,
      "objective/rlhf_reward": -3.717576061905013,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 133.2696075439453,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.8984375,
      "step": 463,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.999076008796692
    },
    {
      "episode": 7440,
      "epoch": 0.04457705719524032,
      "loss/policy_avg": 0.1330358386039734,
      "lr": 9.703476482617588e-06,
      "objective/entropy": -155.3049774169922,
      "objective/kl": 32.32700729370117,
      "objective/non_score_reward": -1.6163502931594849,
      "objective/rlhf_reward": -2.0654012918472286,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 352.436767578125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.86328125,
      "step": 464,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9973843097686768
    },
    {
      "episode": 7456,
      "epoch": 0.04467292183436987,
      "loss/policy_avg": 0.13191767036914825,
      "lr": 9.702837423312883e-06,
      "objective/entropy": -130.06350708007812,
      "objective/kl": 31.98480987548828,
      "objective/non_score_reward": -1.5992405414581299,
      "objective/rlhf_reward": -5.07144889596097,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 4.149503707885742,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.654296875,
      "step": 465,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9979965686798096
    },
    {
      "episode": 7472,
      "epoch": 0.04476878647349942,
      "loss/policy_avg": 0.11230316013097763,
      "lr": 9.70219836400818e-06,
      "objective/entropy": 11.579151153564453,
      "objective/kl": 34.1675910949707,
      "objective/non_score_reward": -1.7083796262741089,
      "objective/rlhf_reward": -5.3525657681778664,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 13.865779876708984,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.708984375,
      "step": 466,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.00162410736084
    },
    {
      "episode": 7488,
      "epoch": 0.04486465111262897,
      "loss/policy_avg": 0.2810555398464203,
      "lr": 9.701559304703477e-06,
      "objective/entropy": -138.13914489746094,
      "objective/kl": 22.91815948486328,
      "objective/non_score_reward": -1.145907998085022,
      "objective/rlhf_reward": -3.205029585448605,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 97.98136901855469,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.775390625,
      "step": 467,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9984302520751953
    },
    {
      "episode": 7504,
      "epoch": 0.044960515751758516,
      "loss/policy_avg": -0.09679757058620453,
      "lr": 9.700920245398774e-06,
      "objective/entropy": -44.23152160644531,
      "objective/kl": 34.52162170410156,
      "objective/non_score_reward": -1.726081132888794,
      "objective/rlhf_reward": -5.170991019407907,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 12.573694229125977,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.3896484375,
      "step": 468,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9995930194854736
    },
    {
      "episode": 7520,
      "epoch": 0.045056380390888065,
      "loss/policy_avg": 0.2740531265735626,
      "lr": 9.700281186094071e-06,
      "objective/entropy": -64.87997436523438,
      "objective/kl": 30.31191062927246,
      "objective/non_score_reward": -1.5155954360961914,
      "objective/rlhf_reward": -4.329048738876978,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 12.677139282226562,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.630859375,
      "step": 469,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9981954097747803
    },
    {
      "episode": 7536,
      "epoch": 0.045152245030017614,
      "loss/policy_avg": 0.4849107265472412,
      "lr": 9.699642126789368e-06,
      "objective/entropy": -136.48355102539062,
      "objective/kl": 20.618619918823242,
      "objective/non_score_reward": -1.030930995941162,
      "objective/rlhf_reward": -2.6998918845253863,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 95.56924438476562,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.607421875,
      "step": 470,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9975948333740234
    },
    {
      "episode": 7552,
      "epoch": 0.04524810966914716,
      "loss/policy_avg": 0.05032477527856827,
      "lr": 9.699003067484663e-06,
      "objective/entropy": -116.99330139160156,
      "objective/kl": 31.927814483642578,
      "objective/non_score_reward": -1.596390724182129,
      "objective/rlhf_reward": -5.026312672828121,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 1.1943883895874023,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.744140625,
      "step": 471,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0004005432128906
    },
    {
      "episode": 7568,
      "epoch": 0.04534397430827671,
      "loss/policy_avg": 0.23768550157546997,
      "lr": 9.69836400817996e-06,
      "objective/entropy": -56.441200256347656,
      "objective/kl": 35.956565856933594,
      "objective/non_score_reward": -1.7978280782699585,
      "objective/rlhf_reward": -5.587192330423909,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 18.25104522705078,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.75390625,
      "step": 472,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001668930053711
    },
    {
      "episode": 7584,
      "epoch": 0.04543983894740626,
      "loss/policy_avg": 0.18428431451320648,
      "lr": 9.697724948875257e-06,
      "objective/entropy": -12.911811828613281,
      "objective/kl": 31.440038681030273,
      "objective/non_score_reward": -1.5720020532608032,
      "objective/rlhf_reward": -4.554674939314523,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 33.68145751953125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.673828125,
      "step": 473,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9997875690460205
    },
    {
      "episode": 7600,
      "epoch": 0.04553570358653581,
      "loss/policy_avg": 1.0267724990844727,
      "lr": 9.697085889570554e-06,
      "objective/entropy": -155.81759643554688,
      "objective/kl": 15.551814079284668,
      "objective/non_score_reward": -0.7775906920433044,
      "objective/rlhf_reward": -1.7317606593049586,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 3.7084851264953613,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.673828125,
      "step": 474,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9998857975006104
    },
    {
      "episode": 7616,
      "epoch": 0.04563156822566536,
      "loss/policy_avg": 0.5301028490066528,
      "lr": 9.69644683026585e-06,
      "objective/entropy": -186.65789794921875,
      "objective/kl": 37.16144561767578,
      "objective/non_score_reward": -1.858072280883789,
      "objective/rlhf_reward": -5.876029699054316,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 48.150047302246094,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.658203125,
      "step": 475,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9972370862960815
    },
    {
      "episode": 7632,
      "epoch": 0.04572743286479491,
      "loss/policy_avg": 0.2144310474395752,
      "lr": 9.695807770961146e-06,
      "objective/entropy": -153.16233825683594,
      "objective/kl": 31.742645263671875,
      "objective/non_score_reward": -1.5871323347091675,
      "objective/rlhf_reward": -4.832757556232151,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 43.260581970214844,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62109375,
      "step": 476,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.996619701385498
    },
    {
      "episode": 7648,
      "epoch": 0.04582329750392446,
      "loss/policy_avg": 0.1423683762550354,
      "lr": 9.695168711656443e-06,
      "objective/entropy": -101.34695434570312,
      "objective/kl": 34.40277099609375,
      "objective/non_score_reward": -1.7201385498046875,
      "objective/rlhf_reward": -5.555040988951845,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 6.133903503417969,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.740234375,
      "step": 477,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9991512298583984
    },
    {
      "episode": 7664,
      "epoch": 0.045919162143054006,
      "loss/policy_avg": -0.20567180216312408,
      "lr": 9.694529652351738e-06,
      "objective/entropy": 1.8477153778076172,
      "objective/kl": 34.25542068481445,
      "objective/non_score_reward": -1.7127711772918701,
      "objective/rlhf_reward": -5.451084411144256,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 90.96925354003906,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.79296875,
      "step": 478,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9978113174438477
    },
    {
      "episode": 7680,
      "epoch": 0.046015026782183555,
      "loss/policy_avg": 0.04285082221031189,
      "lr": 9.693890593047035e-06,
      "objective/entropy": -163.51800537109375,
      "objective/kl": 39.76237487792969,
      "objective/non_score_reward": -1.9881186485290527,
      "objective/rlhf_reward": -6.47152245324409,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 41.795677185058594,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.64453125,
      "step": 479,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9989352226257324
    },
    {
      "episode": 7696,
      "epoch": 0.046110891421313104,
      "loss/policy_avg": 0.30679094791412354,
      "lr": 9.693251533742331e-06,
      "objective/entropy": -137.21139526367188,
      "objective/kl": 24.817203521728516,
      "objective/non_score_reward": -1.2408602237701416,
      "objective/rlhf_reward": -3.407181172576502,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 7.010622024536133,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.740234375,
      "step": 480,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.998010516166687
    },
    {
      "episode": 7712,
      "epoch": 0.04620675606044265,
      "loss/policy_avg": 0.14935311675071716,
      "lr": 9.692612474437628e-06,
      "objective/entropy": -133.61581420898438,
      "objective/kl": 28.18117904663086,
      "objective/non_score_reward": -1.4090590476989746,
      "objective/rlhf_reward": -4.276986324523373,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 41.72409439086914,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.720703125,
      "step": 481,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9976277351379395
    },
    {
      "episode": 7728,
      "epoch": 0.0463026206995722,
      "loss/policy_avg": 0.4503282606601715,
      "lr": 9.691973415132925e-06,
      "objective/entropy": -185.92971801757812,
      "objective/kl": 24.44643783569336,
      "objective/non_score_reward": -1.22232186794281,
      "objective/rlhf_reward": -4.88928747177124,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 26.91709327697754,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.55078125,
      "step": 482,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9986282587051392
    },
    {
      "episode": 7744,
      "epoch": 0.04639848533870175,
      "loss/policy_avg": 0.7586182355880737,
      "lr": 9.691334355828222e-06,
      "objective/entropy": -136.83555603027344,
      "objective/kl": 27.66883087158203,
      "objective/non_score_reward": -1.38344144821167,
      "objective/rlhf_reward": -3.41105959035543,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 39.446250915527344,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4287109375,
      "step": 483,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9947845935821533
    },
    {
      "episode": 7760,
      "epoch": 0.0464943499778313,
      "loss/policy_avg": 0.47291696071624756,
      "lr": 9.690695296523517e-06,
      "objective/entropy": 10.135929107666016,
      "objective/kl": 31.171567916870117,
      "objective/non_score_reward": -1.558578372001648,
      "objective/rlhf_reward": -4.572453921259033,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 15.718633651733398,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.611328125,
      "step": 484,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.997343897819519
    },
    {
      "episode": 7776,
      "epoch": 0.04659021461696085,
      "loss/policy_avg": 0.19839856028556824,
      "lr": 9.690056237218814e-06,
      "objective/entropy": -64.7506332397461,
      "objective/kl": 25.45448112487793,
      "objective/non_score_reward": -1.2727241516113281,
      "objective/rlhf_reward": -2.690896427631378,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 29.054779052734375,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.76953125,
      "step": 485,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9977914094924927
    },
    {
      "episode": 7792,
      "epoch": 0.0466860792560904,
      "loss/policy_avg": 0.16692940890789032,
      "lr": 9.689417177914111e-06,
      "objective/entropy": -200.1573028564453,
      "objective/kl": 16.24359893798828,
      "objective/non_score_reward": -0.8121800422668457,
      "objective/rlhf_reward": -1.6446000672021683,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 3.7478506565093994,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.58984375,
      "step": 486,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9997503757476807
    },
    {
      "episode": 7808,
      "epoch": 0.046781943895219946,
      "loss/policy_avg": 0.20832450687885284,
      "lr": 9.688778118609408e-06,
      "objective/entropy": -229.8734893798828,
      "objective/kl": 24.610809326171875,
      "objective/non_score_reward": -1.2305405139923096,
      "objective/rlhf_reward": -3.3180417156854443,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 50.22547912597656,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62890625,
      "step": 487,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9977538585662842
    },
    {
      "episode": 7824,
      "epoch": 0.046877808534349495,
      "loss/policy_avg": 0.584824800491333,
      "lr": 9.688139059304705e-06,
      "objective/entropy": -159.94088745117188,
      "objective/kl": 32.78782653808594,
      "objective/non_score_reward": -1.6393911838531494,
      "objective/rlhf_reward": -5.041793072017368,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 53.52165985107422,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4560546875,
      "step": 488,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9974932670593262
    },
    {
      "episode": 7840,
      "epoch": 0.046973673173479044,
      "loss/policy_avg": 0.10657641291618347,
      "lr": 9.6875e-06,
      "objective/entropy": -117.46031188964844,
      "objective/kl": 22.680068969726562,
      "objective/non_score_reward": -1.1340034008026123,
      "objective/rlhf_reward": -2.802680269877116,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 31.437467575073242,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.640625,
      "step": 489,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9984140396118164
    },
    {
      "episode": 7856,
      "epoch": 0.0470695378126086,
      "loss/policy_avg": 0.05225694179534912,
      "lr": 9.686860940695297e-06,
      "objective/entropy": -102.69722747802734,
      "objective/kl": 35.890769958496094,
      "objective/non_score_reward": -1.7945387363433838,
      "objective/rlhf_reward": -2.7781547069549557,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 8.238727569580078,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.693359375,
      "step": 490,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.996579885482788
    },
    {
      "episode": 7872,
      "epoch": 0.04716540245173815,
      "loss/policy_avg": 0.3118276000022888,
      "lr": 9.686221881390594e-06,
      "objective/entropy": -42.73939895629883,
      "objective/kl": 22.486095428466797,
      "objective/non_score_reward": -1.1243047714233398,
      "objective/rlhf_reward": -3.0733869268494525,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 29.32803726196289,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.609375,
      "step": 491,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9991399049758911
    },
    {
      "episode": 7888,
      "epoch": 0.0472612670908677,
      "loss/policy_avg": 0.621738076210022,
      "lr": 9.68558282208589e-06,
      "objective/entropy": -26.77874755859375,
      "objective/kl": 33.77405548095703,
      "objective/non_score_reward": -1.688702940940857,
      "objective/rlhf_reward": -5.198552160468653,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 9.273128509521484,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.73046875,
      "step": 492,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9988332986831665
    },
    {
      "episode": 7904,
      "epoch": 0.04735713172999725,
      "loss/policy_avg": 0.16049662232398987,
      "lr": 9.684943762781188e-06,
      "objective/entropy": -84.04755401611328,
      "objective/kl": 25.384605407714844,
      "objective/non_score_reward": -1.2692303657531738,
      "objective/rlhf_reward": -2.1532023891222205,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 0.7223958373069763,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.646484375,
      "step": 493,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0005576610565186
    },
    {
      "episode": 7920,
      "epoch": 0.047452996369126796,
      "loss/policy_avg": 0.3413264751434326,
      "lr": 9.684304703476484e-06,
      "objective/entropy": -118.85188293457031,
      "objective/kl": 30.77880859375,
      "objective/non_score_reward": -1.5389404296875,
      "objective/rlhf_reward": -4.422428623835246,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 19.30898666381836,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.611328125,
      "step": 494,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.997934103012085
    },
    {
      "episode": 7936,
      "epoch": 0.047548861008256345,
      "loss/policy_avg": -0.016445789486169815,
      "lr": 9.68366564417178e-06,
      "objective/entropy": -211.39361572265625,
      "objective/kl": 26.587682723999023,
      "objective/non_score_reward": -1.3293840885162354,
      "objective/rlhf_reward": -2.917536354064941,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 50.449562072753906,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.5234375,
      "step": 495,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.99893319606781
    },
    {
      "episode": 7952,
      "epoch": 0.047644725647385894,
      "loss/policy_avg": -0.2565712034702301,
      "lr": 9.683026584867076e-06,
      "objective/entropy": -49.41560363769531,
      "objective/kl": 27.722068786621094,
      "objective/non_score_reward": -1.3861035108566284,
      "objective/rlhf_reward": -3.882554417074309,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 16.277629852294922,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.703125,
      "step": 496,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.003061294555664
    },
    {
      "episode": 7968,
      "epoch": 0.04774059028651544,
      "loss/policy_avg": 0.17001637816429138,
      "lr": 9.682387525562373e-06,
      "objective/entropy": -40.254676818847656,
      "objective/kl": 25.527742385864258,
      "objective/non_score_reward": -1.2763869762420654,
      "objective/rlhf_reward": -5.10554826259613,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 19.284744262695312,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6875,
      "step": 497,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9972081184387207
    },
    {
      "episode": 7984,
      "epoch": 0.04783645492564499,
      "loss/policy_avg": 0.08028728514909744,
      "lr": 9.68174846625767e-06,
      "objective/entropy": -23.79485321044922,
      "objective/kl": 23.14282989501953,
      "objective/non_score_reward": -1.1571415662765503,
      "objective/rlhf_reward": -4.628566324710846,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 25.781452178955078,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4580078125,
      "step": 498,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9980249404907227
    },
    {
      "episode": 8000,
      "epoch": 0.04793231956477454,
      "loss/policy_avg": 0.2174569070339203,
      "lr": 9.681109406952967e-06,
      "objective/entropy": -109.13389587402344,
      "objective/kl": 36.64985656738281,
      "objective/non_score_reward": -1.8324928283691406,
      "objective/rlhf_reward": -5.951369323817593,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 27.508981704711914,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.525390625,
      "step": 499,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.99745512008667
    },
    {
      "episode": 8016,
      "epoch": 0.04802818420390409,
      "loss/policy_avg": 0.13631635904312134,
      "lr": 9.680470347648262e-06,
      "objective/entropy": -99.519775390625,
      "objective/kl": 41.364810943603516,
      "objective/non_score_reward": -2.0682406425476074,
      "objective/rlhf_reward": -6.448134417804788,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 102.98858642578125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4970703125,
      "step": 500,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.998145341873169
    },
    {
      "episode": 8032,
      "epoch": 0.04812404884303364,
      "loss/policy_avg": 0.059351589530706406,
      "lr": 9.67983128834356e-06,
      "objective/entropy": -226.86756896972656,
      "objective/kl": 27.588150024414062,
      "objective/non_score_reward": -1.379407525062561,
      "objective/rlhf_reward": -4.001858436855015,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 6.536296844482422,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.65625,
      "step": 501,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9976544380187988
    },
    {
      "episode": 8048,
      "epoch": 0.04821991348216319,
      "loss/policy_avg": 0.5408469438552856,
      "lr": 9.679192229038854e-06,
      "objective/entropy": 4.518913269042969,
      "objective/kl": 37.552825927734375,
      "objective/non_score_reward": -1.8776414394378662,
      "objective/rlhf_reward": -5.777232364813486,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 8.410907745361328,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.787109375,
      "step": 502,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9991774559020996
    },
    {
      "episode": 8064,
      "epoch": 0.048315778121292736,
      "loss/policy_avg": 1.089150071144104,
      "lr": 9.678553169734151e-06,
      "objective/entropy": -70.22102355957031,
      "objective/kl": 36.886138916015625,
      "objective/non_score_reward": -1.8443071842193604,
      "objective/rlhf_reward": -5.254522027746711,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 17.696430206298828,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.662109375,
      "step": 503,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9944283962249756
    },
    {
      "episode": 8080,
      "epoch": 0.048411642760422285,
      "loss/policy_avg": 0.04815336689352989,
      "lr": 9.677914110429448e-06,
      "objective/entropy": -206.61251831054688,
      "objective/kl": 19.784542083740234,
      "objective/non_score_reward": -0.9892270565032959,
      "objective/rlhf_reward": -2.4411365626179538,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 10.987642288208008,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6328125,
      "step": 504,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9982428550720215
    },
    {
      "episode": 8096,
      "epoch": 0.048507507399551834,
      "loss/policy_avg": 0.4511667788028717,
      "lr": 9.677275051124745e-06,
      "objective/entropy": -44.11040496826172,
      "objective/kl": 32.054603576660156,
      "objective/non_score_reward": -1.6027300357818604,
      "objective/rlhf_reward": -4.8951483605229225,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 161.647705078125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.57421875,
      "step": 505,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9990203380584717
    },
    {
      "episode": 8112,
      "epoch": 0.04860337203868138,
      "loss/policy_avg": 0.43728113174438477,
      "lr": 9.676635991820042e-06,
      "objective/entropy": -167.46401977539062,
      "objective/kl": 25.358474731445312,
      "objective/non_score_reward": -1.2679238319396973,
      "objective/rlhf_reward": -5.071695148944855,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 6.505180358886719,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.67578125,
      "step": 506,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999653697013855
    },
    {
      "episode": 8128,
      "epoch": 0.04869923667781093,
      "loss/policy_avg": 0.049704909324645996,
      "lr": 9.675996932515339e-06,
      "objective/entropy": -68.84889221191406,
      "objective/kl": 23.506563186645508,
      "objective/non_score_reward": -1.1753281354904175,
      "objective/rlhf_reward": -3.3227105523027003,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 3.8750853538513184,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.783203125,
      "step": 507,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.99936842918396
    },
    {
      "episode": 8144,
      "epoch": 0.04879510131694048,
      "loss/policy_avg": 0.23126532137393951,
      "lr": 9.675357873210634e-06,
      "objective/entropy": -193.32493591308594,
      "objective/kl": 30.975135803222656,
      "objective/non_score_reward": -1.5487568378448486,
      "objective/rlhf_reward": -4.072320940271888,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 30.721832275390625,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.599609375,
      "step": 508,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.994474172592163
    },
    {
      "episode": 8160,
      "epoch": 0.04889096595607003,
      "loss/policy_avg": 0.6136177778244019,
      "lr": 9.67471881390593e-06,
      "objective/entropy": 35.12611770629883,
      "objective/kl": 24.636138916015625,
      "objective/non_score_reward": -1.2318068742752075,
      "objective/rlhf_reward": -2.979816268162663,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 31.945526123046875,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.736328125,
      "step": 509,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.001587390899658
    },
    {
      "episode": 8176,
      "epoch": 0.04898683059519958,
      "loss/policy_avg": 0.07654842734336853,
      "lr": 9.674079754601228e-06,
      "objective/entropy": -218.7822265625,
      "objective/kl": 30.072967529296875,
      "objective/non_score_reward": -1.5036484003067017,
      "objective/rlhf_reward": -3.8918873689332347,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 42.21351623535156,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.763671875,
      "step": 510,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9961347579956055
    },
    {
      "episode": 8192,
      "epoch": 0.04908269523432913,
      "loss/policy_avg": 0.4642539322376251,
      "lr": 9.673440695296525e-06,
      "objective/entropy": -61.26002502441406,
      "objective/kl": 28.09502410888672,
      "objective/non_score_reward": -1.4047513008117676,
      "objective/rlhf_reward": -4.168407420726165,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 28.139495849609375,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.50390625,
      "step": 511,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9988558292388916
    },
    {
      "episode": 8208,
      "epoch": 0.04917855987345868,
      "loss/policy_avg": -0.1496490240097046,
      "lr": 9.672801635991821e-06,
      "objective/entropy": -237.9604034423828,
      "objective/kl": 24.80710220336914,
      "objective/non_score_reward": -1.2403552532196045,
      "objective/rlhf_reward": -3.5828184867776454,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 9.494747161865234,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.671875,
      "step": 512,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.000725507736206
    },
    {
      "episode": 8224,
      "epoch": 0.049274424512588226,
      "loss/policy_avg": -0.18209466338157654,
      "lr": 9.672162576687117e-06,
      "objective/entropy": -180.66116333007812,
      "objective/kl": 25.97962188720703,
      "objective/non_score_reward": -1.2989810705184937,
      "objective/rlhf_reward": -3.073217930571113,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 41.079193115234375,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.73046875,
      "step": 513,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.997527837753296
    },
    {
      "episode": 8240,
      "epoch": 0.049370289151717775,
      "loss/policy_avg": 0.3504701852798462,
      "lr": 9.671523517382413e-06,
      "objective/entropy": -98.80787658691406,
      "objective/kl": 26.576587677001953,
      "objective/non_score_reward": -1.3288295269012451,
      "objective/rlhf_reward": -0.9153180480003353,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 13.758487701416016,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6953125,
      "step": 514,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.998854160308838
    },
    {
      "episode": 8256,
      "epoch": 0.049466153790847324,
      "loss/policy_avg": 0.48611417412757874,
      "lr": 9.67088445807771e-06,
      "objective/entropy": -128.45774841308594,
      "objective/kl": 29.784334182739258,
      "objective/non_score_reward": -1.4892168045043945,
      "objective/rlhf_reward": -4.223533527056375,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 1.2566263675689697,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.681640625,
      "step": 515,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0001587867736816
    },
    {
      "episode": 8272,
      "epoch": 0.04956201842997687,
      "loss/policy_avg": -0.13057222962379456,
      "lr": 9.670245398773007e-06,
      "objective/entropy": -146.07781982421875,
      "objective/kl": 31.182106018066406,
      "objective/non_score_reward": -1.5591052770614624,
      "objective/rlhf_reward": -3.8364211082458493,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 15.76829719543457,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.576171875,
      "step": 516,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0000805854797363
    },
    {
      "episode": 8288,
      "epoch": 0.04965788306910642,
      "loss/policy_avg": 0.637583315372467,
      "lr": 9.669606339468304e-06,
      "objective/entropy": -144.37762451171875,
      "objective/kl": 27.648868560791016,
      "objective/non_score_reward": -1.3824436664581299,
      "objective/rlhf_reward": -4.0140026448094215,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 2.933715343475342,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.64453125,
      "step": 517,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9995014667510986
    },
    {
      "episode": 8304,
      "epoch": 0.04975374770823597,
      "loss/policy_avg": 0.23517751693725586,
      "lr": 9.668967280163601e-06,
      "objective/entropy": -130.0078125,
      "objective/kl": 26.889904022216797,
      "objective/non_score_reward": -1.344495415687561,
      "objective/rlhf_reward": -3.927383343787536,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 35.43697738647461,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.81640625,
      "step": 518,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9984192848205566
    },
    {
      "episode": 8320,
      "epoch": 0.04984961234736552,
      "loss/policy_avg": -0.05650443956255913,
      "lr": 9.668328220858896e-06,
      "objective/entropy": -214.1605682373047,
      "objective/kl": 21.148624420166016,
      "objective/non_score_reward": -1.0574312210083008,
      "objective/rlhf_reward": -2.673465876784876,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 18.935588836669922,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.599609375,
      "step": 519,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9993261098861694
    },
    {
      "episode": 8336,
      "epoch": 0.04994547698649507,
      "loss/policy_avg": -0.034447960555553436,
      "lr": 9.667689161554193e-06,
      "objective/entropy": -158.14088439941406,
      "objective/kl": 32.29146957397461,
      "objective/non_score_reward": -1.61457359790802,
      "objective/rlhf_reward": -4.902035086360529,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 6.876145362854004,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.58984375,
      "step": 520,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9993882179260254
    },
    {
      "episode": 8352,
      "epoch": 0.05004134162562462,
      "loss/policy_avg": -0.13744737207889557,
      "lr": 9.66705010224949e-06,
      "objective/entropy": -204.13546752929688,
      "objective/kl": 28.699504852294922,
      "objective/non_score_reward": -1.4349753856658936,
      "objective/rlhf_reward": -4.361299076167446,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 2.3828086853027344,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.66796875,
      "step": 521,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0017244815826416
    },
    {
      "episode": 8368,
      "epoch": 0.05013720626475417,
      "loss/policy_avg": 0.13512714207172394,
      "lr": 9.666411042944787e-06,
      "objective/entropy": -234.03375244140625,
      "objective/kl": 27.24090576171875,
      "objective/non_score_reward": -1.3620452880859375,
      "objective/rlhf_reward": -3.932409131320652,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 27.1795654296875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.689453125,
      "step": 522,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999112844467163
    },
    {
      "episode": 8384,
      "epoch": 0.050233070903883716,
      "loss/policy_avg": -0.011349002830684185,
      "lr": 9.665771983640082e-06,
      "objective/entropy": -252.35935974121094,
      "objective/kl": 35.68749237060547,
      "objective/non_score_reward": -1.784374713897705,
      "objective/rlhf_reward": -5.7588963890946925,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 13.969385147094727,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.626953125,
      "step": 523,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.9982863664627075
    },
    {
      "episode": 8400,
      "epoch": 0.050328935543013265,
      "loss/policy_avg": 0.03610409051179886,
      "lr": 9.665132924335379e-06,
      "objective/entropy": -18.527732849121094,
      "objective/kl": 31.889944076538086,
      "objective/non_score_reward": -1.5944972038269043,
      "objective/rlhf_reward": -4.927390317530975,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 69.35887145996094,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.86328125,
      "step": 524,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999839425086975
    },
    {
      "episode": 8416,
      "epoch": 0.050424800182142814,
      "loss/policy_avg": 0.4427942633628845,
      "lr": 9.664493865030676e-06,
      "objective/entropy": -203.7809295654297,
      "objective/kl": 25.36702537536621,
      "objective/non_score_reward": -1.2683511972427368,
      "objective/rlhf_reward": -3.6495729281502642,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 22.38974380493164,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.828125,
      "step": 525,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9989018440246582
    },
    {
      "episode": 8432,
      "epoch": 0.05052066482127236,
      "loss/policy_avg": 1.6773953437805176,
      "lr": 9.663854805725971e-06,
      "objective/entropy": -146.93841552734375,
      "objective/kl": 37.069419860839844,
      "objective/non_score_reward": -1.853471040725708,
      "objective/rlhf_reward": -5.990052063663569,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 11.231493949890137,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.53515625,
      "step": 526,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9981741905212402
    },
    {
      "episode": 8448,
      "epoch": 0.05061652946040191,
      "loss/policy_avg": -0.08897572010755539,
      "lr": 9.663215746421268e-06,
      "objective/entropy": -158.65708923339844,
      "objective/kl": 23.60004997253418,
      "objective/non_score_reward": -1.1800025701522827,
      "objective/rlhf_reward": -3.394497547179384,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 10.824882507324219,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.6953125,
      "step": 527,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9995331764221191
    },
    {
      "episode": 8464,
      "epoch": 0.05071239409953146,
      "loss/policy_avg": 0.024341005831956863,
      "lr": 9.662576687116565e-06,
      "objective/entropy": -174.72035217285156,
      "objective/kl": 29.104461669921875,
      "objective/non_score_reward": -1.4552230834960938,
      "objective/rlhf_reward": -4.479256918936401,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 17.054231643676758,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.470703125,
      "step": 528,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.999741554260254
    },
    {
      "episode": 8480,
      "epoch": 0.05080825873866101,
      "loss/policy_avg": 0.257159948348999,
      "lr": 9.661937627811862e-06,
      "objective/entropy": -200.30184936523438,
      "objective/kl": 23.69171905517578,
      "objective/non_score_reward": -1.1845859289169312,
      "objective/rlhf_reward": -3.338343775272369,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 6.550008773803711,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.595703125,
      "step": 529,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9984736442565918
    },
    {
      "episode": 8496,
      "epoch": 0.05090412337779056,
      "loss/policy_avg": 0.4184650182723999,
      "lr": 9.661298568507158e-06,
      "objective/entropy": -344.7420959472656,
      "objective/kl": 24.219188690185547,
      "objective/non_score_reward": -1.2109594345092773,
      "objective/rlhf_reward": -3.4652354205525935,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 67.58980560302734,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.681640625,
      "step": 530,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9985125064849854
    },
    {
      "episode": 8512,
      "epoch": 0.05099998801692011,
      "loss/policy_avg": -0.0187949538230896,
      "lr": 9.660659509202455e-06,
      "objective/entropy": -14.01883316040039,
      "objective/kl": 29.49643325805664,
      "objective/non_score_reward": -1.47482168674469,
      "objective/rlhf_reward": -4.520684697715145,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 6.090343475341797,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.59765625,
      "step": 531,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0014190673828125
    },
    {
      "episode": 8528,
      "epoch": 0.051095852656049656,
      "loss/policy_avg": 0.5480527877807617,
      "lr": 9.66002044989775e-06,
      "objective/entropy": -169.82949829101562,
      "objective/kl": 34.57899475097656,
      "objective/non_score_reward": -1.728949785232544,
      "objective/rlhf_reward": -5.434846642430186,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 7.255028247833252,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.638671875,
      "step": 532,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9971615076065063
    },
    {
      "episode": 8544,
      "epoch": 0.051191717295179205,
      "loss/policy_avg": 0.2761814594268799,
      "lr": 9.659381390593047e-06,
      "objective/entropy": -100.77452850341797,
      "objective/kl": 36.835365295410156,
      "objective/non_score_reward": -1.8417682647705078,
      "objective/rlhf_reward": -6.007823192809505,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 50.438026428222656,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.625,
      "step": 533,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9985384941101074
    },
    {
      "episode": 8560,
      "epoch": 0.051287581934308754,
      "loss/policy_avg": 0.4119563698768616,
      "lr": 9.658742331288344e-06,
      "objective/entropy": -65.70556640625,
      "objective/kl": 29.577213287353516,
      "objective/non_score_reward": -1.47886061668396,
      "objective/rlhf_reward": -3.792736174837623,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 7.75493049621582,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4990234375,
      "step": 534,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.002697467803955
    },
    {
      "episode": 8576,
      "epoch": 0.0513834465734383,
      "loss/policy_avg": 0.12609338760375977,
      "lr": 9.658103271983641e-06,
      "objective/entropy": -150.71954345703125,
      "objective/kl": 28.952709197998047,
      "objective/non_score_reward": -1.447635531425476,
      "objective/rlhf_reward": -4.3905422449111935,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 34.924835205078125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.615234375,
      "step": 535,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0004661083221436
    },
    {
      "episode": 8592,
      "epoch": 0.05147931121256785,
      "loss/policy_avg": 0.014640828594565392,
      "lr": 9.657464212678938e-06,
      "objective/entropy": -37.74507141113281,
      "objective/kl": 25.910266876220703,
      "objective/non_score_reward": -1.295513391494751,
      "objective/rlhf_reward": -0.7820532083511349,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 2.0191965103149414,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.751953125,
      "step": 536,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0010244846343994
    },
    {
      "episode": 8608,
      "epoch": 0.0515751758516974,
      "loss/policy_avg": 0.04429921880364418,
      "lr": 9.656825153374235e-06,
      "objective/entropy": -26.176483154296875,
      "objective/kl": 32.8004264831543,
      "objective/non_score_reward": -1.6400213241577148,
      "objective/rlhf_reward": -4.826751814285913,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 42.128135681152344,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.908203125,
      "step": 537,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0013113021850586
    },
    {
      "episode": 8624,
      "epoch": 0.05167104049082695,
      "loss/policy_avg": 0.46547916531562805,
      "lr": 9.65618609406953e-06,
      "objective/entropy": 7.776313781738281,
      "objective/kl": 28.19791030883789,
      "objective/non_score_reward": -1.4098955392837524,
      "objective/rlhf_reward": -3.906248764197031,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 4.504173755645752,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.8828125,
      "step": 538,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.998396635055542
    },
    {
      "episode": 8640,
      "epoch": 0.0517669051299565,
      "loss/policy_avg": 0.0001214742660522461,
      "lr": 9.655547034764827e-06,
      "objective/entropy": -112.6850357055664,
      "objective/kl": 31.756372451782227,
      "objective/non_score_reward": -1.5878187417984009,
      "objective/rlhf_reward": -4.228568734900032,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 2.7504100799560547,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.546875,
      "step": 539,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0001778602600098
    },
    {
      "episode": 8656,
      "epoch": 0.05186276976908605,
      "loss/policy_avg": 0.41524794697761536,
      "lr": 9.654907975460124e-06,
      "objective/entropy": -135.01878356933594,
      "objective/kl": 23.119266510009766,
      "objective/non_score_reward": -1.1559633016586304,
      "objective/rlhf_reward": -3.0675939609676153,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 26.581480026245117,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.6015625,
      "step": 540,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9962246417999268
    },
    {
      "episode": 8672,
      "epoch": 0.0519586344082156,
      "loss/policy_avg": 0.3321428894996643,
      "lr": 9.65426891615542e-06,
      "objective/entropy": -5.44740104675293,
      "objective/kl": 39.89240264892578,
      "objective/non_score_reward": -1.9946203231811523,
      "objective/rlhf_reward": -7.97848105430603,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 67.52932739257812,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62890625,
      "step": 541,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9986319541931152
    },
    {
      "episode": 8688,
      "epoch": 0.052054499047345146,
      "loss/policy_avg": 0.22704890370368958,
      "lr": 9.653629856850718e-06,
      "objective/entropy": 23.631000518798828,
      "objective/kl": 22.43924331665039,
      "objective/non_score_reward": -1.121962308883667,
      "objective/rlhf_reward": -3.109246918050152,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 40.600868225097656,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.708984375,
      "step": 542,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0008883476257324
    },
    {
      "episode": 8704,
      "epoch": 0.052150363686474695,
      "loss/policy_avg": 0.6167892217636108,
      "lr": 9.652990797546013e-06,
      "objective/entropy": 8.02947998046875,
      "objective/kl": 34.78337478637695,
      "objective/non_score_reward": -1.739168643951416,
      "objective/rlhf_reward": -5.556674695014953,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 7.763035774230957,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.55859375,
      "step": 543,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9983458518981934
    },
    {
      "episode": 8720,
      "epoch": 0.052246228325604244,
      "loss/policy_avg": 0.1720658838748932,
      "lr": 9.65235173824131e-06,
      "objective/entropy": 0.5252876281738281,
      "objective/kl": 31.73941993713379,
      "objective/non_score_reward": -1.5869710445404053,
      "objective/rlhf_reward": -4.79162499209936,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 6.366281509399414,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.76953125,
      "step": 544,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9988775253295898
    },
    {
      "episode": 8736,
      "epoch": 0.05234209296473379,
      "loss/policy_avg": 0.07084909081459045,
      "lr": 9.651712678936605e-06,
      "objective/entropy": -50.734527587890625,
      "objective/kl": 24.657032012939453,
      "objective/non_score_reward": -1.2328516244888306,
      "objective/rlhf_reward": -3.1065776899185886,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 12.337860107421875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.521484375,
      "step": 545,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9985425472259521
    },
    {
      "episode": 8752,
      "epoch": 0.05243795760386334,
      "loss/policy_avg": -0.053861357271671295,
      "lr": 9.651073619631902e-06,
      "objective/entropy": -242.29559326171875,
      "objective/kl": 21.178913116455078,
      "objective/non_score_reward": -1.058945655822754,
      "objective/rlhf_reward": -2.6316629386583146,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 23.818538665771484,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.62109375,
      "step": 546,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0112152099609375
    },
    {
      "episode": 8768,
      "epoch": 0.0525338222429929,
      "loss/policy_avg": -0.008508548140525818,
      "lr": 9.650434560327199e-06,
      "objective/entropy": -46.92424011230469,
      "objective/kl": 39.04132843017578,
      "objective/non_score_reward": -1.952066421508789,
      "objective/rlhf_reward": -6.429663398352963,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 15.27535629272461,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4677734375,
      "step": 547,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9982926845550537
    },
    {
      "episode": 8784,
      "epoch": 0.052629686882122446,
      "loss/policy_avg": 0.17654258012771606,
      "lr": 9.649795501022496e-06,
      "objective/entropy": -44.7242431640625,
      "objective/kl": 19.804813385009766,
      "objective/non_score_reward": -0.9902406930923462,
      "objective/rlhf_reward": -2.635450038939638,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 39.75682067871094,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.76953125,
      "step": 548,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.002223014831543
    },
    {
      "episode": 8800,
      "epoch": 0.052725551521251995,
      "loss/policy_avg": 0.46367156505584717,
      "lr": 9.649156441717792e-06,
      "objective/entropy": -132.18556213378906,
      "objective/kl": 38.18450927734375,
      "objective/non_score_reward": -1.909225344657898,
      "objective/rlhf_reward": -6.0327816343942455,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 24.263263702392578,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.7421875,
      "step": 549,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9974383115768433
    },
    {
      "episode": 8816,
      "epoch": 0.052821416160381544,
      "loss/policy_avg": 0.2747136950492859,
      "lr": 9.64851738241309e-06,
      "objective/entropy": -91.26388549804688,
      "objective/kl": 28.735111236572266,
      "objective/non_score_reward": -1.4367555379867554,
      "objective/rlhf_reward": -4.085162764013396,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 5.113122940063477,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.626953125,
      "step": 550,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000535011291504
    },
    {
      "episode": 8832,
      "epoch": 0.05291728079951109,
      "loss/policy_avg": 0.031243963167071342,
      "lr": 9.647878323108384e-06,
      "objective/entropy": -40.358192443847656,
      "objective/kl": 31.673667907714844,
      "objective/non_score_reward": -1.5836834907531738,
      "objective/rlhf_reward": -4.993098309546142,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 78.17581939697266,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.796875,
      "step": 551,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9989157915115356
    },
    {
      "episode": 8848,
      "epoch": 0.05301314543864064,
      "loss/policy_avg": -0.28017422556877136,
      "lr": 9.647239263803681e-06,
      "objective/entropy": -100.97856140136719,
      "objective/kl": 33.18678283691406,
      "objective/non_score_reward": -1.659339189529419,
      "objective/rlhf_reward": -6.637356638908386,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 6.006505012512207,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.572265625,
      "step": 552,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.003824234008789
    },
    {
      "episode": 8864,
      "epoch": 0.05310901007777019,
      "loss/policy_avg": 0.04892890527844429,
      "lr": 9.646600204498978e-06,
      "objective/entropy": -136.31918334960938,
      "objective/kl": 19.06879997253418,
      "objective/non_score_reward": -0.9534400105476379,
      "objective/rlhf_reward": -2.2575007369190008,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 1.5354987382888794,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.572265625,
      "step": 553,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.003046989440918
    },
    {
      "episode": 8880,
      "epoch": 0.05320487471689974,
      "loss/policy_avg": 0.1114959716796875,
      "lr": 9.645961145194275e-06,
      "objective/entropy": -125.14915466308594,
      "objective/kl": 41.65575408935547,
      "objective/non_score_reward": -2.0827877521514893,
      "objective/rlhf_reward": -6.383740137295659,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 12.4759521484375,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.66796875,
      "step": 554,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9973247051239014
    },
    {
      "episode": 8896,
      "epoch": 0.05330073935602929,
      "loss/policy_avg": 0.2784144878387451,
      "lr": 9.645322085889572e-06,
      "objective/entropy": -42.213340759277344,
      "objective/kl": 34.43170928955078,
      "objective/non_score_reward": -1.7215855121612549,
      "objective/rlhf_reward": -6.8863421976566315,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 37.5791015625,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.546875,
      "step": 555,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9974336624145508
    },
    {
      "episode": 8912,
      "epoch": 0.05339660399515884,
      "loss/policy_avg": -0.0683375895023346,
      "lr": 9.644683026584867e-06,
      "objective/entropy": -94.292724609375,
      "objective/kl": 29.925048828125,
      "objective/non_score_reward": -1.4962522983551025,
      "objective/rlhf_reward": -4.4287500669627935,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 0.9679741263389587,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.53125,
      "step": 556,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.002323627471924
    },
    {
      "episode": 8928,
      "epoch": 0.05349246863428839,
      "loss/policy_avg": 0.3528517484664917,
      "lr": 9.644043967280164e-06,
      "objective/entropy": 100.1601791381836,
      "objective/kl": 29.87194061279297,
      "objective/non_score_reward": -1.4935970306396484,
      "objective/rlhf_reward": -4.493435802872538,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 21.40321922302246,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.54296875,
      "step": 557,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.999911904335022
    },
    {
      "episode": 8944,
      "epoch": 0.053588333273417936,
      "loss/policy_avg": 0.15664523839950562,
      "lr": 9.643404907975461e-06,
      "objective/entropy": -163.13458251953125,
      "objective/kl": 43.485382080078125,
      "objective/non_score_reward": -2.174269199371338,
      "objective/rlhf_reward": -6.297076797485351,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 28.333932876586914,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.52734375,
      "step": 558,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9964426755905151
    },
    {
      "episode": 8960,
      "epoch": 0.053684197912547485,
      "loss/policy_avg": 0.6344835162162781,
      "lr": 9.642765848670758e-06,
      "objective/entropy": -252.752685546875,
      "objective/kl": 33.16960144042969,
      "objective/non_score_reward": -1.658479928970337,
      "objective/rlhf_reward": -5.255317785827023,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 52.37012481689453,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.771484375,
      "step": 559,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 2.0004310607910156
    },
    {
      "episode": 8976,
      "epoch": 0.053780062551677034,
      "loss/policy_avg": 0.19869406521320343,
      "lr": 9.642126789366055e-06,
      "objective/entropy": -50.086647033691406,
      "objective/kl": 30.926883697509766,
      "objective/non_score_reward": -1.5463443994522095,
      "objective/rlhf_reward": -4.629118292537287,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 26.995628356933594,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.626953125,
      "step": 560,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9950168132781982
    },
    {
      "episode": 8992,
      "epoch": 0.05387592719080658,
      "loss/policy_avg": -0.010918349027633667,
      "lr": 9.641487730061352e-06,
      "objective/entropy": -168.9771728515625,
      "objective/kl": 22.5106201171875,
      "objective/non_score_reward": -1.1255309581756592,
      "objective/rlhf_reward": -3.1604882984453733,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 20.162094116210938,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.763671875,
      "step": 561,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 2.001406669616699
    },
    {
      "episode": 9008,
      "epoch": 0.05397179182993613,
      "loss/policy_avg": 0.4963573217391968,
      "lr": 9.640848670756647e-06,
      "objective/entropy": -159.58302307128906,
      "objective/kl": 34.39787673950195,
      "objective/non_score_reward": -1.7198940515518188,
      "objective/rlhf_reward": -5.455743868549433,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 32.154441833496094,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.59375,
      "step": 562,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.99989652633667
    },
    {
      "episode": 9024,
      "epoch": 0.05406765646906568,
      "loss/policy_avg": 0.4512660503387451,
      "lr": 9.640209611451944e-06,
      "objective/entropy": -112.33628845214844,
      "objective/kl": 34.371681213378906,
      "objective/non_score_reward": -1.7185840606689453,
      "objective/rlhf_reward": -5.515086495612545,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 4.578237056732178,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.599609375,
      "step": 563,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9984909296035767
    },
    {
      "episode": 9040,
      "epoch": 0.05416352110819523,
      "loss/policy_avg": 0.08781366050243378,
      "lr": 9.63957055214724e-06,
      "objective/entropy": -39.49800491333008,
      "objective/kl": 33.1617431640625,
      "objective/non_score_reward": -1.6580872535705566,
      "objective/rlhf_reward": -4.232348775863647,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 4.19449520111084,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.677734375,
      "step": 564,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.000304698944092
    },
    {
      "episode": 9056,
      "epoch": 0.05425938574732478,
      "loss/policy_avg": 0.02701903134584427,
      "lr": 9.638931492842537e-06,
      "objective/entropy": -135.10118103027344,
      "objective/kl": 34.19304656982422,
      "objective/non_score_reward": -1.7096521854400635,
      "objective/rlhf_reward": -5.388010840030059,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 18.33478546142578,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.61328125,
      "step": 565,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999070644378662
    },
    {
      "episode": 9072,
      "epoch": 0.05435525038645433,
      "loss/policy_avg": 0.2804332375526428,
      "lr": 9.638292433537834e-06,
      "objective/entropy": -100.01052856445312,
      "objective/kl": 28.388795852661133,
      "objective/non_score_reward": -1.4194397926330566,
      "objective/rlhf_reward": -5.677759170532227,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 7.587360382080078,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.68359375,
      "step": 566,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0005486011505127
    },
    {
      "episode": 9088,
      "epoch": 0.05445111502558388,
      "loss/policy_avg": 0.4314262866973877,
      "lr": 9.63765337423313e-06,
      "objective/entropy": -130.2495574951172,
      "objective/kl": 35.38700866699219,
      "objective/non_score_reward": -1.7693501710891724,
      "objective/rlhf_reward": -5.4155414156323545,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 44.93388366699219,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.75390625,
      "step": 567,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9968568086624146
    },
    {
      "episode": 9104,
      "epoch": 0.054546979664713426,
      "loss/policy_avg": 0.3399587869644165,
      "lr": 9.637014314928426e-06,
      "objective/entropy": -247.61073303222656,
      "objective/kl": 28.445119857788086,
      "objective/non_score_reward": -1.4222559928894043,
      "objective/rlhf_reward": -3.864195342334818,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 7.162724018096924,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.607421875,
      "step": 568,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9984673261642456
    },
    {
      "episode": 9120,
      "epoch": 0.054642844303842975,
      "loss/policy_avg": 0.5520263314247131,
      "lr": 9.636375255623721e-06,
      "objective/entropy": -97.92376708984375,
      "objective/kl": 26.055057525634766,
      "objective/non_score_reward": -1.30275297164917,
      "objective/rlhf_reward": -3.088305356279884,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 36.18694305419922,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.494140625,
      "step": 569,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0012941360473633
    },
    {
      "episode": 9136,
      "epoch": 0.054738708942972523,
      "loss/policy_avg": 0.09734541922807693,
      "lr": 9.635736196319018e-06,
      "objective/entropy": -196.53872680664062,
      "objective/kl": 23.71702003479004,
      "objective/non_score_reward": -1.185850977897644,
      "objective/rlhf_reward": -4.743403911590576,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 2.213500738143921,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.580078125,
      "step": 570,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9993475675582886
    },
    {
      "episode": 9152,
      "epoch": 0.05483457358210207,
      "loss/policy_avg": 0.4516823887825012,
      "lr": 9.635097137014315e-06,
      "objective/entropy": -126.11761474609375,
      "objective/kl": 28.336185455322266,
      "objective/non_score_reward": -1.4168094396591187,
      "objective/rlhf_reward": -1.2672375202178952,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 44.684326171875,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.775390625,
      "step": 571,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9990687370300293
    },
    {
      "episode": 9168,
      "epoch": 0.05493043822123162,
      "loss/policy_avg": 0.34894299507141113,
      "lr": 9.634458077709612e-06,
      "objective/entropy": -3.410472869873047,
      "objective/kl": 35.99509048461914,
      "objective/non_score_reward": -1.7997545003890991,
      "objective/rlhf_reward": -5.87350514891736,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 4.621858596801758,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.634765625,
      "step": 572,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0019896030426025
    },
    {
      "episode": 9184,
      "epoch": 0.05502630286036117,
      "loss/policy_avg": 0.1023169457912445,
      "lr": 9.633819018404909e-06,
      "objective/entropy": -180.73724365234375,
      "objective/kl": 24.693328857421875,
      "objective/non_score_reward": -1.2346664667129517,
      "objective/rlhf_reward": -3.5386658668518063,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 22.89309310913086,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.64453125,
      "step": 573,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9981255531311035
    },
    {
      "episode": 9200,
      "epoch": 0.05512216749949072,
      "loss/policy_avg": 0.2509443163871765,
      "lr": 9.633179959100206e-06,
      "objective/entropy": -268.43072509765625,
      "objective/kl": 28.437435150146484,
      "objective/non_score_reward": -1.4218716621398926,
      "objective/rlhf_reward": -4.131227611508921,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 60.228729248046875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.78125,
      "step": 574,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0000219345092773
    },
    {
      "episode": 9216,
      "epoch": 0.05521803213862027,
      "loss/policy_avg": -0.04683633893728256,
      "lr": 9.632540899795501e-06,
      "objective/entropy": -70.71329498291016,
      "objective/kl": 38.51101303100586,
      "objective/non_score_reward": -1.9255508184432983,
      "objective/rlhf_reward": -5.877374465736459,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 3.3532156944274902,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.615234375,
      "step": 575,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.000458002090454
    },
    {
      "episode": 9232,
      "epoch": 0.05531389677774982,
      "loss/policy_avg": 0.25571292638778687,
      "lr": 9.631901840490798e-06,
      "objective/entropy": -197.88787841796875,
      "objective/kl": 25.574037551879883,
      "objective/non_score_reward": -1.278701901435852,
      "objective/rlhf_reward": -3.3814741532007853,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 6.096738815307617,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.671875,
      "step": 576,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.001577854156494
    },
    {
      "episode": 9248,
      "epoch": 0.055409761416879366,
      "loss/policy_avg": 0.7064580917358398,
      "lr": 9.631262781186095e-06,
      "objective/entropy": -150.29953002929688,
      "objective/kl": 30.821884155273438,
      "objective/non_score_reward": -1.5410943031311035,
      "objective/rlhf_reward": -4.43104387919108,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 43.45115280151367,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.60546875,
      "step": 577,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9980316162109375
    },
    {
      "episode": 9264,
      "epoch": 0.055505626056008915,
      "loss/policy_avg": 0.20062510669231415,
      "lr": 9.630623721881392e-06,
      "objective/entropy": -158.88388061523438,
      "objective/kl": 28.73421859741211,
      "objective/non_score_reward": -1.4367109537124634,
      "objective/rlhf_reward": -4.346843814849853,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 12.110857963562012,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.69140625,
      "step": 578,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.998002290725708
    },
    {
      "episode": 9280,
      "epoch": 0.055601490695138464,
      "loss/policy_avg": 0.08450721949338913,
      "lr": 9.629984662576689e-06,
      "objective/entropy": -250.45445251464844,
      "objective/kl": 27.57752227783203,
      "objective/non_score_reward": -1.3788762092590332,
      "objective/rlhf_reward": -4.064906816096649,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 17.175188064575195,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.5859375,
      "step": 579,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.997262716293335
    },
    {
      "episode": 9296,
      "epoch": 0.05569735533426801,
      "loss/policy_avg": 0.41482874751091003,
      "lr": 9.629345603271984e-06,
      "objective/entropy": -177.06607055664062,
      "objective/kl": 29.43456268310547,
      "objective/non_score_reward": -1.4717282056808472,
      "objective/rlhf_reward": -2.9631939872514934,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 50.86977005004883,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.595703125,
      "step": 580,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0005576610565186
    },
    {
      "episode": 9312,
      "epoch": 0.05579321997339756,
      "loss/policy_avg": 0.20043331384658813,
      "lr": 9.62870654396728e-06,
      "objective/entropy": -224.79660034179688,
      "objective/kl": 23.171340942382812,
      "objective/non_score_reward": -1.1585670709609985,
      "objective/rlhf_reward": -2.6868569953011825,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 4.841948986053467,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.603515625,
      "step": 581,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.00144362449646
    },
    {
      "episode": 9328,
      "epoch": 0.05588908461252711,
      "loss/policy_avg": 0.28447139263153076,
      "lr": 9.628067484662578e-06,
      "objective/entropy": -44.1309814453125,
      "objective/kl": 42.387351989746094,
      "objective/non_score_reward": -2.1193675994873047,
      "objective/rlhf_reward": -7.151957724124117,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 20.72610092163086,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.419921875,
      "step": 582,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9971909523010254
    },
    {
      "episode": 9344,
      "epoch": 0.05598494925165666,
      "loss/policy_avg": 0.09533769637346268,
      "lr": 9.627428425357874e-06,
      "objective/entropy": -218.9058380126953,
      "objective/kl": 27.360652923583984,
      "objective/non_score_reward": -1.368032693862915,
      "objective/rlhf_reward": -4.021532396884307,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 11.28432846069336,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7421875,
      "step": 583,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.9974052906036377
    },
    {
      "episode": 9360,
      "epoch": 0.05608081389078621,
      "loss/policy_avg": 0.5065032243728638,
      "lr": 9.626789366053171e-06,
      "objective/entropy": -231.38427734375,
      "objective/kl": 32.08224105834961,
      "objective/non_score_reward": -1.604112148284912,
      "objective/rlhf_reward": -5.0748127012545154,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 40.948760986328125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.69921875,
      "step": 584,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9988150596618652
    },
    {
      "episode": 9376,
      "epoch": 0.05617667852991576,
      "loss/policy_avg": 0.6530688405036926,
      "lr": 9.626150306748468e-06,
      "objective/entropy": -116.65798950195312,
      "objective/kl": 31.407730102539062,
      "objective/non_score_reward": -1.570386528968811,
      "objective/rlhf_reward": -4.902944007006985,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 13.348186492919922,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.54296875,
      "step": 585,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000188112258911
    },
    {
      "episode": 9392,
      "epoch": 0.05627254316904531,
      "loss/policy_avg": -0.06093317270278931,
      "lr": 9.625511247443763e-06,
      "objective/entropy": -245.7208251953125,
      "objective/kl": 22.28873634338379,
      "objective/non_score_reward": -1.1144368648529053,
      "objective/rlhf_reward": -2.33504098869947,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 1.7080774307250977,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.681640625,
      "step": 586,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.002075672149658
    },
    {
      "episode": 9408,
      "epoch": 0.056368407808174856,
      "loss/policy_avg": 0.4493389129638672,
      "lr": 9.62487218813906e-06,
      "objective/entropy": -11.156410217285156,
      "objective/kl": 29.71312141418457,
      "objective/non_score_reward": -1.4856561422348022,
      "objective/rlhf_reward": -4.117795641693186,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 18.012893676757812,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.759765625,
      "step": 587,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000352382659912
    },
    {
      "episode": 9424,
      "epoch": 0.056464272447304405,
      "loss/policy_avg": 0.3274408280849457,
      "lr": 9.624233128834357e-06,
      "objective/entropy": -116.3506088256836,
      "objective/kl": 35.94437026977539,
      "objective/non_score_reward": -1.7972185611724854,
      "objective/rlhf_reward": -4.788874185085296,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 17.158645629882812,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.69140625,
      "step": 588,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9996552467346191
    },
    {
      "episode": 9440,
      "epoch": 0.056560137086433954,
      "loss/policy_avg": 0.879096508026123,
      "lr": 9.623594069529654e-06,
      "objective/entropy": -152.50155639648438,
      "objective/kl": 32.464576721191406,
      "objective/non_score_reward": -1.623228669166565,
      "objective/rlhf_reward": -5.069082756240931,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 70.49058532714844,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.849609375,
      "step": 589,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.001586437225342
    },
    {
      "episode": 9456,
      "epoch": 0.0566560017255635,
      "loss/policy_avg": 0.2921786904335022,
      "lr": 9.62295501022495e-06,
      "objective/entropy": -177.27088928222656,
      "objective/kl": 39.783531188964844,
      "objective/non_score_reward": -1.989176630973816,
      "objective/rlhf_reward": -6.57810423621307,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 77.26689147949219,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.6796875,
      "step": 590,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9989970922470093
    },
    {
      "episode": 9472,
      "epoch": 0.05675186636469305,
      "loss/policy_avg": 0.3912142515182495,
      "lr": 9.622315950920246e-06,
      "objective/entropy": -120.1540756225586,
      "objective/kl": 31.21270179748535,
      "objective/non_score_reward": -1.5606350898742676,
      "objective/rlhf_reward": -3.842540299892425,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 25.256790161132812,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.654296875,
      "step": 591,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9980988502502441
    },
    {
      "episode": 9488,
      "epoch": 0.0568477310038226,
      "loss/policy_avg": 0.04369340091943741,
      "lr": 9.621676891615543e-06,
      "objective/entropy": -277.40753173828125,
      "objective/kl": 29.685585021972656,
      "objective/non_score_reward": -1.4842792749404907,
      "objective/rlhf_reward": -1.5371170997619625,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 7.890674591064453,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.677734375,
      "step": 592,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9981930255889893
    },
    {
      "episode": 9504,
      "epoch": 0.05694359564295215,
      "loss/policy_avg": 0.05721379816532135,
      "lr": 9.621037832310838e-06,
      "objective/entropy": -257.69232177734375,
      "objective/kl": 23.966060638427734,
      "objective/non_score_reward": -1.19830322265625,
      "objective/rlhf_reward": -3.0598793412248293,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 20.133102416992188,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.732421875,
      "step": 593,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.001232624053955
    },
    {
      "episode": 9520,
      "epoch": 0.0570394602820817,
      "loss/policy_avg": 0.5772296786308289,
      "lr": 9.620398773006135e-06,
      "objective/entropy": -89.6330795288086,
      "objective/kl": 31.078372955322266,
      "objective/non_score_reward": -1.5539186000823975,
      "objective/rlhf_reward": -4.734722021038889,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 21.1763916015625,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.87109375,
      "step": 594,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.000513792037964
    },
    {
      "episode": 9536,
      "epoch": 0.05713532492121125,
      "loss/policy_avg": -0.026315592229366302,
      "lr": 9.619759713701432e-06,
      "objective/entropy": -219.30979919433594,
      "objective/kl": 26.461135864257812,
      "objective/non_score_reward": -1.323056697845459,
      "objective/rlhf_reward": -3.9329772827371787,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 8.585318565368652,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.677734375,
      "step": 595,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0008788108825684
    },
    {
      "episode": 9552,
      "epoch": 0.057231189560340796,
      "loss/policy_avg": 0.2548080384731293,
      "lr": 9.619120654396729e-06,
      "objective/entropy": -37.27716827392578,
      "objective/kl": 44.03446960449219,
      "objective/non_score_reward": -2.201723575592041,
      "objective/rlhf_reward": -7.356295923800811,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 21.06201934814453,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.63671875,
      "step": 596,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9991514682769775
    },
    {
      "episode": 9568,
      "epoch": 0.057327054199470345,
      "loss/policy_avg": 2.5911049842834473,
      "lr": 9.618481595092026e-06,
      "objective/entropy": -171.7782745361328,
      "objective/kl": 20.800029754638672,
      "objective/non_score_reward": -1.0400015115737915,
      "objective/rlhf_reward": -1.760006046295166,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 2.9469943046569824,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6484375,
      "step": 597,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.014529228210449
    },
    {
      "episode": 9584,
      "epoch": 0.057422918838599894,
      "loss/policy_avg": -0.1166892945766449,
      "lr": 9.617842535787323e-06,
      "objective/entropy": -109.67333221435547,
      "objective/kl": 34.37934494018555,
      "objective/non_score_reward": -1.7189671993255615,
      "objective/rlhf_reward": -6.8758686780929565,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 17.377391815185547,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.4794921875,
      "step": 598,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0002119541168213
    },
    {
      "episode": 9600,
      "epoch": 0.05751878347772944,
      "loss/policy_avg": -0.15396325290203094,
      "lr": 9.617203476482618e-06,
      "objective/entropy": -128.05728149414062,
      "objective/kl": 29.42688751220703,
      "objective/non_score_reward": -1.4713443517684937,
      "objective/rlhf_reward": -4.060548658641886,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 2.408236026763916,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.71484375,
      "step": 599,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.002739429473877
    },
    {
      "episode": 9616,
      "epoch": 0.05761464811685899,
      "loss/policy_avg": 0.14407247304916382,
      "lr": 9.616564417177915e-06,
      "objective/entropy": -272.3529357910156,
      "objective/kl": 21.596874237060547,
      "objective/non_score_reward": -1.0798437595367432,
      "objective/rlhf_reward": -1.3956560238611426,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 7.104412078857422,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.58984375,
      "step": 600,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001608371734619
    },
    {
      "episode": 9632,
      "epoch": 0.05771051275598854,
      "loss/policy_avg": 0.20445303618907928,
      "lr": 9.615925357873211e-06,
      "objective/entropy": -291.0384521484375,
      "objective/kl": 28.06856918334961,
      "objective/non_score_reward": -1.403428554534912,
      "objective/rlhf_reward": -4.235111692038876,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 9.333198547363281,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.720703125,
      "step": 601,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0009381771087646
    },
    {
      "episode": 9648,
      "epoch": 0.05780637739511809,
      "loss/policy_avg": 0.7656448483467102,
      "lr": 9.615286298568508e-06,
      "objective/entropy": -4.355806350708008,
      "objective/kl": 34.863006591796875,
      "objective/non_score_reward": -1.7431503534317017,
      "objective/rlhf_reward": -5.548769433696833,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 10.645190238952637,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.591796875,
      "step": 602,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9971041679382324
    },
    {
      "episode": 9664,
      "epoch": 0.05790224203424764,
      "loss/policy_avg": 0.1100698709487915,
      "lr": 9.614647239263805e-06,
      "objective/entropy": -203.49618530273438,
      "objective/kl": 19.046649932861328,
      "objective/non_score_reward": -0.9523325562477112,
      "objective/rlhf_reward": -2.4307281161225855,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 0.499467670917511,
      "policy/clipfrac_avg": 0.25,
      "policy/entropy_avg": 0.638671875,
      "step": 603,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0019733905792236
    },
    {
      "episode": 9680,
      "epoch": 0.05799810667337719,
      "loss/policy_avg": 0.17878472805023193,
      "lr": 9.6140081799591e-06,
      "objective/entropy": -162.996826171875,
      "objective/kl": 23.458127975463867,
      "objective/non_score_reward": -1.172906517982483,
      "objective/rlhf_reward": -3.3661131596862504,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 8.434497833251953,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.5390625,
      "step": 604,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9986371994018555
    },
    {
      "episode": 9696,
      "epoch": 0.058093971312506744,
      "loss/policy_avg": 0.5608217716217041,
      "lr": 9.613369120654397e-06,
      "objective/entropy": -168.91802978515625,
      "objective/kl": 31.90495491027832,
      "objective/non_score_reward": -1.5952478647232056,
      "objective/rlhf_reward": -3.4572724446069927,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 10.658321380615234,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.57421875,
      "step": 605,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999322772026062
    },
    {
      "episode": 9712,
      "epoch": 0.05818983595163629,
      "loss/policy_avg": 0.10194225609302521,
      "lr": 9.612730061349694e-06,
      "objective/entropy": -138.00286865234375,
      "objective/kl": 34.8355712890625,
      "objective/non_score_reward": -1.7417783737182617,
      "objective/rlhf_reward": -5.641600999861879,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 19.823665618896484,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.765625,
      "step": 606,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.000347852706909
    },
    {
      "episode": 9728,
      "epoch": 0.05828570059076584,
      "loss/policy_avg": 1.170401930809021,
      "lr": 9.612091002044991e-06,
      "objective/entropy": -171.179443359375,
      "objective/kl": 23.883764266967773,
      "objective/non_score_reward": -1.1941882371902466,
      "objective/rlhf_reward": -3.2609813449704017,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 1.674392819404602,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.654296875,
      "step": 607,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0004193782806396
    },
    {
      "episode": 9744,
      "epoch": 0.05838156522989539,
      "loss/policy_avg": 0.05054464191198349,
      "lr": 9.611451942740288e-06,
      "objective/entropy": -196.56436157226562,
      "objective/kl": 23.218883514404297,
      "objective/non_score_reward": -1.1609442234039307,
      "objective/rlhf_reward": -3.1931789918855276,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 11.145727157592773,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.60546875,
      "step": 608,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.998692512512207
    },
    {
      "episode": 9760,
      "epoch": 0.05847742986902494,
      "loss/policy_avg": 0.054385945200920105,
      "lr": 9.610812883435585e-06,
      "objective/entropy": -244.93141174316406,
      "objective/kl": 29.985477447509766,
      "objective/non_score_reward": -1.4992740154266357,
      "objective/rlhf_reward": -4.637845957015438,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 19.703460693359375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.69140625,
      "step": 609,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 2.0000925064086914
    },
    {
      "episode": 9776,
      "epoch": 0.05857329450815449,
      "loss/policy_avg": -0.05685323104262352,
      "lr": 9.61017382413088e-06,
      "objective/entropy": -65.63417053222656,
      "objective/kl": 31.53623390197754,
      "objective/non_score_reward": -1.5768117904663086,
      "objective/rlhf_reward": -3.383528147579405,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 12.860790252685547,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.59375,
      "step": 610,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.001614809036255
    },
    {
      "episode": 9792,
      "epoch": 0.05866915914728404,
      "loss/policy_avg": 0.20876801013946533,
      "lr": 9.609534764826177e-06,
      "objective/entropy": -112.53227996826172,
      "objective/kl": 41.12568664550781,
      "objective/non_score_reward": -2.0562844276428223,
      "objective/rlhf_reward": -5.825137710571289,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 33.385337829589844,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.626953125,
      "step": 611,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000166654586792
    },
    {
      "episode": 9808,
      "epoch": 0.05876502378641359,
      "loss/policy_avg": 0.2722185552120209,
      "lr": 9.608895705521472e-06,
      "objective/entropy": -124.71205139160156,
      "objective/kl": 38.9796257019043,
      "objective/non_score_reward": -1.9489812850952148,
      "objective/rlhf_reward": -5.395925498008728,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 19.52260971069336,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.623046875,
      "step": 612,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9988607168197632
    },
    {
      "episode": 9824,
      "epoch": 0.058860888425543136,
      "loss/policy_avg": 0.7936792969703674,
      "lr": 9.608256646216769e-06,
      "objective/entropy": -150.9628448486328,
      "objective/kl": 32.946922302246094,
      "objective/non_score_reward": -1.6473462581634521,
      "objective/rlhf_reward": -5.165553171833125,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 23.228769302368164,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.537109375,
      "step": 613,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0001022815704346
    },
    {
      "episode": 9840,
      "epoch": 0.058956753064672685,
      "loss/policy_avg": 0.8288295269012451,
      "lr": 9.607617586912066e-06,
      "objective/entropy": -145.37136840820312,
      "objective/kl": 37.17048645019531,
      "objective/non_score_reward": -1.8585245609283447,
      "objective/rlhf_reward": -5.6092691376534205,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 8.95422077178955,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.568359375,
      "step": 614,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9995349645614624
    },
    {
      "episode": 9856,
      "epoch": 0.05905261770380223,
      "loss/policy_avg": 0.19199243187904358,
      "lr": 9.606978527607363e-06,
      "objective/entropy": -158.26043701171875,
      "objective/kl": 31.016521453857422,
      "objective/non_score_reward": -1.550826072692871,
      "objective/rlhf_reward": -4.8440544244989585,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 1.4004452228546143,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5546875,
      "step": 615,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.00236439704895
    },
    {
      "episode": 9872,
      "epoch": 0.05914848234293178,
      "loss/policy_avg": 0.29752206802368164,
      "lr": 9.60633946830266e-06,
      "objective/entropy": -141.43800354003906,
      "objective/kl": 27.8808536529541,
      "objective/non_score_reward": -1.394042730331421,
      "objective/rlhf_reward": -3.842837558190028,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 10.629474639892578,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.5078125,
      "step": 616,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.00028657913208
    },
    {
      "episode": 9888,
      "epoch": 0.05924434698206133,
      "loss/policy_avg": 0.2227097749710083,
      "lr": 9.605700408997955e-06,
      "objective/entropy": -97.0810775756836,
      "objective/kl": 34.3601188659668,
      "objective/non_score_reward": -1.718005895614624,
      "objective/rlhf_reward": -5.4481916024285235,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 16.432331085205078,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.548828125,
      "step": 617,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9975783824920654
    },
    {
      "episode": 9904,
      "epoch": 0.05934021162119088,
      "loss/policy_avg": 0.17975842952728271,
      "lr": 9.605061349693252e-06,
      "objective/entropy": -200.100830078125,
      "objective/kl": 28.51620864868164,
      "objective/non_score_reward": -1.4258103370666504,
      "objective/rlhf_reward": -3.8784127190438022,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 10.591612815856934,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.658203125,
      "step": 618,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0000221729278564
    },
    {
      "episode": 9920,
      "epoch": 0.05943607626032043,
      "loss/policy_avg": 0.4452857971191406,
      "lr": 9.604422290388548e-06,
      "objective/entropy": -87.9361572265625,
      "objective/kl": 34.174217224121094,
      "objective/non_score_reward": -1.7087109088897705,
      "objective/rlhf_reward": -5.278584449496821,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 24.203800201416016,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.587890625,
      "step": 619,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9989922046661377
    },
    {
      "episode": 9936,
      "epoch": 0.05953194089944998,
      "loss/policy_avg": 0.31785786151885986,
      "lr": 9.603783231083845e-06,
      "objective/entropy": -56.93491744995117,
      "objective/kl": 34.28547286987305,
      "objective/non_score_reward": -1.7142736911773682,
      "objective/rlhf_reward": -5.032265897068094,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 12.636474609375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.85546875,
      "step": 620,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.996368408203125
    },
    {
      "episode": 9952,
      "epoch": 0.05962780553857953,
      "loss/policy_avg": 0.6350647211074829,
      "lr": 9.603144171779142e-06,
      "objective/entropy": -129.3587188720703,
      "objective/kl": 41.710655212402344,
      "objective/non_score_reward": -2.0855326652526855,
      "objective/rlhf_reward": -6.219424667135749,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 11.748146057128906,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4775390625,
      "step": 621,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9979774951934814
    },
    {
      "episode": 9968,
      "epoch": 0.059723670177709076,
      "loss/policy_avg": 0.9843254089355469,
      "lr": 9.602505112474439e-06,
      "objective/entropy": -95.34288024902344,
      "objective/kl": 49.37370300292969,
      "objective/non_score_reward": -2.4686851501464844,
      "objective/rlhf_reward": -8.049912209781716,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 31.02006721496582,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.4970703125,
      "step": 622,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9965976476669312
    },
    {
      "episode": 9984,
      "epoch": 0.059819534816838625,
      "loss/policy_avg": 0.6165390610694885,
      "lr": 9.601866053169734e-06,
      "objective/entropy": -100.56966400146484,
      "objective/kl": 33.22990036010742,
      "objective/non_score_reward": -1.6614950895309448,
      "objective/rlhf_reward": -5.286730491851253,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 13.85442066192627,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.61328125,
      "step": 623,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9971177577972412
    },
    {
      "episode": 10000,
      "epoch": 0.059915399455968174,
      "loss/policy_avg": 0.3318287134170532,
      "lr": 9.601226993865031e-06,
      "objective/entropy": -212.1555938720703,
      "objective/kl": 25.822668075561523,
      "objective/non_score_reward": -1.2911334037780762,
      "objective/rlhf_reward": -2.2408145412218303,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 1.2788864374160767,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.525390625,
      "step": 624,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9995461702346802
    },
    {
      "episode": 10016,
      "epoch": 0.06001126409509772,
      "loss/policy_avg": 0.35671815276145935,
      "lr": 9.600587934560328e-06,
      "objective/entropy": -96.60403442382812,
      "objective/kl": 42.28247833251953,
      "objective/non_score_reward": -2.114124059677124,
      "objective/rlhf_reward": -6.6316679671135645,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 9.525958061218262,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.55859375,
      "step": 625,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999858021736145
    },
    {
      "episode": 10032,
      "epoch": 0.06010712873422727,
      "loss/policy_avg": 0.026430530473589897,
      "lr": 9.599948875255625e-06,
      "objective/entropy": -96.45112609863281,
      "objective/kl": 30.055763244628906,
      "objective/non_score_reward": -1.5027881860733032,
      "objective/rlhf_reward": -4.56055448493515,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 6.234503746032715,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.521484375,
      "step": 626,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.002078056335449
    },
    {
      "episode": 10048,
      "epoch": 0.06020299337335682,
      "loss/policy_avg": -0.07770150899887085,
      "lr": 9.599309815950922e-06,
      "objective/entropy": -78.50785827636719,
      "objective/kl": 33.19765090942383,
      "objective/non_score_reward": -1.6598827838897705,
      "objective/rlhf_reward": -5.158578279431223,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 60.745849609375,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.5546875,
      "step": 627,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0026586055755615
    },
    {
      "episode": 10064,
      "epoch": 0.06029885801248637,
      "loss/policy_avg": 0.045525066554546356,
      "lr": 9.598670756646217e-06,
      "objective/entropy": -207.98727416992188,
      "objective/kl": 34.44676208496094,
      "objective/non_score_reward": -1.7223379611968994,
      "objective/rlhf_reward": -5.489351963996887,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 2.952592372894287,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.671875,
      "step": 628,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9989871978759766
    },
    {
      "episode": 10080,
      "epoch": 0.06039472265161592,
      "loss/policy_avg": 0.32521092891693115,
      "lr": 9.598031697341514e-06,
      "objective/entropy": -71.00718688964844,
      "objective/kl": 27.00582504272461,
      "objective/non_score_reward": -1.3502912521362305,
      "objective/rlhf_reward": -3.977332849701015,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 5.865281105041504,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7578125,
      "step": 629,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.001133918762207
    },
    {
      "episode": 10096,
      "epoch": 0.06049058729074547,
      "loss/policy_avg": 0.22257700562477112,
      "lr": 9.59739263803681e-06,
      "objective/entropy": -87.40052795410156,
      "objective/kl": 31.356922149658203,
      "objective/non_score_reward": -1.5678460597991943,
      "objective/rlhf_reward": -4.32397324867719,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 27.549453735351562,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.546875,
      "step": 630,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999577522277832
    },
    {
      "episode": 10112,
      "epoch": 0.06058645192987502,
      "loss/policy_avg": 0.4591647982597351,
      "lr": 9.596753578732108e-06,
      "objective/entropy": -35.01010513305664,
      "objective/kl": 28.93059539794922,
      "objective/non_score_reward": -1.4465298652648926,
      "objective/rlhf_reward": -4.42686941597311,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 10.006196975708008,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.8203125,
      "step": 631,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9968595504760742
    },
    {
      "episode": 10128,
      "epoch": 0.060682316569004566,
      "loss/policy_avg": 0.9483177661895752,
      "lr": 9.596114519427405e-06,
      "objective/entropy": -152.91030883789062,
      "objective/kl": 30.360069274902344,
      "objective/non_score_reward": -1.5180034637451172,
      "objective/rlhf_reward": -4.338680283228555,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 15.410400390625,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.599609375,
      "step": 632,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9943327903747559
    },
    {
      "episode": 10144,
      "epoch": 0.060778181208134115,
      "loss/policy_avg": 0.4167541265487671,
      "lr": 9.595475460122701e-06,
      "objective/entropy": -154.04684448242188,
      "objective/kl": 33.39550018310547,
      "objective/non_score_reward": -1.6697750091552734,
      "objective/rlhf_reward": -5.074980471197682,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 53.406578063964844,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.576171875,
      "step": 633,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9963878393173218
    },
    {
      "episode": 10160,
      "epoch": 0.060874045847263664,
      "loss/policy_avg": -0.021846026182174683,
      "lr": 9.594836400817997e-06,
      "objective/entropy": -22.81509780883789,
      "objective/kl": 23.709880828857422,
      "objective/non_score_reward": -1.1854941844940186,
      "objective/rlhf_reward": -2.917147810730051,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 0.839837908744812,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.6953125,
      "step": 634,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000304937362671
    },
    {
      "episode": 10176,
      "epoch": 0.06096991048639321,
      "loss/policy_avg": 0.014755940064787865,
      "lr": 9.594197341513293e-06,
      "objective/entropy": -198.07839965820312,
      "objective/kl": 21.79191017150879,
      "objective/non_score_reward": -1.0895954370498657,
      "objective/rlhf_reward": -1.9583818078041078,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 0.6484163999557495,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.767578125,
      "step": 635,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0002074241638184
    },
    {
      "episode": 10192,
      "epoch": 0.06106577512552276,
      "loss/policy_avg": 0.13533297181129456,
      "lr": 9.593558282208589e-06,
      "objective/entropy": -201.26246643066406,
      "objective/kl": 26.135250091552734,
      "objective/non_score_reward": -1.3067626953125,
      "objective/rlhf_reward": -3.885414889364868,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 11.92165756225586,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.740234375,
      "step": 636,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9993889331817627
    },
    {
      "episode": 10208,
      "epoch": 0.06116163976465231,
      "loss/policy_avg": 0.4021642506122589,
      "lr": 9.592919222903886e-06,
      "objective/entropy": -286.0339050292969,
      "objective/kl": 14.542181968688965,
      "objective/non_score_reward": -0.7271090745925903,
      "objective/rlhf_reward": -1.484604258735744,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 5.031335353851318,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.701171875,
      "step": 637,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.003591775894165
    },
    {
      "episode": 10224,
      "epoch": 0.06125750440378186,
      "loss/policy_avg": 0.2514651417732239,
      "lr": 9.592280163599182e-06,
      "objective/entropy": -132.75355529785156,
      "objective/kl": 25.25128173828125,
      "objective/non_score_reward": -1.2625641822814941,
      "objective/rlhf_reward": -3.5996581717446894,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 14.74315071105957,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.712890625,
      "step": 638,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.000126361846924
    },
    {
      "episode": 10240,
      "epoch": 0.06135336904291141,
      "loss/policy_avg": 0.012995198369026184,
      "lr": 9.59164110429448e-06,
      "objective/entropy": -181.2290496826172,
      "objective/kl": 22.253154754638672,
      "objective/non_score_reward": -1.1126577854156494,
      "objective/rlhf_reward": -3.026798923214046,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 0.9591898918151855,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.53515625,
      "step": 639,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9993176460266113
    },
    {
      "episode": 10256,
      "epoch": 0.06144923368204096,
      "loss/policy_avg": 0.15271592140197754,
      "lr": 9.591002044989776e-06,
      "objective/entropy": -105.57412719726562,
      "objective/kl": 38.59171676635742,
      "objective/non_score_reward": -1.9295859336853027,
      "objective/rlhf_reward": -6.16208431026037,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 6.626259803771973,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.734375,
      "step": 640,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.996435284614563
    },
    {
      "episode": 10272,
      "epoch": 0.061545098321170506,
      "loss/policy_avg": -0.11524446308612823,
      "lr": 9.590362985685071e-06,
      "objective/entropy": -123.53447723388672,
      "objective/kl": 26.7266845703125,
      "objective/non_score_reward": -1.336334228515625,
      "objective/rlhf_reward": -3.222630920187507,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 1.8472533226013184,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.46484375,
      "step": 641,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.011606216430664
    },
    {
      "episode": 10288,
      "epoch": 0.061640962960300055,
      "loss/policy_avg": 0.4013972282409668,
      "lr": 9.589723926380368e-06,
      "objective/entropy": -128.90103149414062,
      "objective/kl": 31.007064819335938,
      "objective/non_score_reward": -1.5503532886505127,
      "objective/rlhf_reward": -4.685641431602177,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 6.671117782592773,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.556640625,
      "step": 642,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9970167875289917
    },
    {
      "episode": 10304,
      "epoch": 0.061736827599429604,
      "loss/policy_avg": 0.7907944321632385,
      "lr": 9.589084867075665e-06,
      "objective/entropy": -58.220497131347656,
      "objective/kl": 41.770606994628906,
      "objective/non_score_reward": -2.0885305404663086,
      "objective/rlhf_reward": -6.620788232485452,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 17.74094581604004,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.466796875,
      "step": 643,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.995949149131775
    },
    {
      "episode": 10320,
      "epoch": 0.06183269223855915,
      "loss/policy_avg": 0.017528323456645012,
      "lr": 9.588445807770962e-06,
      "objective/entropy": -208.79119873046875,
      "objective/kl": 23.041034698486328,
      "objective/non_score_reward": -1.1520518064498901,
      "objective/rlhf_reward": -3.092435383590397,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 1.83624267578125,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.732421875,
      "step": 644,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0010547637939453
    },
    {
      "episode": 10336,
      "epoch": 0.0619285568776887,
      "loss/policy_avg": 0.15500307083129883,
      "lr": 9.587806748466259e-06,
      "objective/entropy": -124.78570556640625,
      "objective/kl": 34.243202209472656,
      "objective/non_score_reward": -1.7121602296829224,
      "objective/rlhf_reward": -3.92492190444586,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 2.4558181762695312,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5078125,
      "step": 645,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9997308254241943
    },
    {
      "episode": 10352,
      "epoch": 0.06202442151681825,
      "loss/policy_avg": 0.2161247283220291,
      "lr": 9.587167689161556e-06,
      "objective/entropy": -163.63064575195312,
      "objective/kl": 25.873336791992188,
      "objective/non_score_reward": -1.293666958808899,
      "objective/rlhf_reward": -3.7960657263673365,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 8.89102840423584,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.5234375,
      "step": 646,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.998837947845459
    },
    {
      "episode": 10368,
      "epoch": 0.0621202861559478,
      "loss/policy_avg": 0.08966261148452759,
      "lr": 9.586528629856851e-06,
      "objective/entropy": -104.2444076538086,
      "objective/kl": 33.29509735107422,
      "objective/non_score_reward": -1.664754867553711,
      "objective/rlhf_reward": -4.925686256090799,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 1.3677499294281006,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62890625,
      "step": 647,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9993959665298462
    },
    {
      "episode": 10384,
      "epoch": 0.06221615079507735,
      "loss/policy_avg": -0.02724701538681984,
      "lr": 9.585889570552148e-06,
      "objective/entropy": -133.99429321289062,
      "objective/kl": 27.543067932128906,
      "objective/non_score_reward": -1.3771533966064453,
      "objective/rlhf_reward": -3.1086136460304257,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 7.215035438537598,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.677734375,
      "step": 648,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9986003637313843
    },
    {
      "episode": 10400,
      "epoch": 0.0623120154342069,
      "loss/policy_avg": -0.23539991676807404,
      "lr": 9.585250511247445e-06,
      "objective/entropy": -167.906494140625,
      "objective/kl": 25.879772186279297,
      "objective/non_score_reward": -1.293988585472107,
      "objective/rlhf_reward": -3.571834478441792,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 2.0954341888427734,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.57421875,
      "step": 649,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9997683763504028
    },
    {
      "episode": 10416,
      "epoch": 0.06240788007333645,
      "loss/policy_avg": 0.30569222569465637,
      "lr": 9.584611451942742e-06,
      "objective/entropy": -226.60678100585938,
      "objective/kl": 28.675113677978516,
      "objective/non_score_reward": -1.433755874633789,
      "objective/rlhf_reward": -3.7876121503877,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 52.77922058105469,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6484375,
      "step": 650,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.9980084896087646
    },
    {
      "episode": 10432,
      "epoch": 0.062503744712466,
      "loss/policy_avg": -0.24214023351669312,
      "lr": 9.583972392638038e-06,
      "objective/entropy": -121.17498779296875,
      "objective/kl": 38.84062957763672,
      "objective/non_score_reward": -1.9420316219329834,
      "objective/rlhf_reward": -5.820715139584477,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 1.8967432975769043,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.623046875,
      "step": 651,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0004329681396484
    },
    {
      "episode": 10448,
      "epoch": 0.06259960935159554,
      "loss/policy_avg": -0.3156575858592987,
      "lr": 9.583333333333335e-06,
      "objective/entropy": -146.38143920898438,
      "objective/kl": 32.020687103271484,
      "objective/non_score_reward": -1.60103440284729,
      "objective/rlhf_reward": -5.062502017527251,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 2.199296236038208,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.642578125,
      "step": 652,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0025553703308105
    },
    {
      "episode": 10464,
      "epoch": 0.0626954739907251,
      "loss/policy_avg": 0.07271748781204224,
      "lr": 9.58269427402863e-06,
      "objective/entropy": -196.48562622070312,
      "objective/kl": 28.001068115234375,
      "objective/non_score_reward": -1.4000535011291504,
      "objective/rlhf_reward": -4.2002141833305355,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 24.475753784179688,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6875,
      "step": 653,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0010745525360107
    },
    {
      "episode": 10480,
      "epoch": 0.06279133862985464,
      "loss/policy_avg": 0.17373695969581604,
      "lr": 9.582055214723927e-06,
      "objective/entropy": -275.5335388183594,
      "objective/kl": 27.79926300048828,
      "objective/non_score_reward": -1.3899632692337036,
      "objective/rlhf_reward": -5.5598530769348145,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 17.22200584411621,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.615234375,
      "step": 654,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9987027645111084
    },
    {
      "episode": 10496,
      "epoch": 0.0628872032689842,
      "loss/policy_avg": 0.15186084806919098,
      "lr": 9.581416155419224e-06,
      "objective/entropy": -197.2568817138672,
      "objective/kl": 23.105377197265625,
      "objective/non_score_reward": -1.1552690267562866,
      "objective/rlhf_reward": -2.796247239383768,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 35.64599609375,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7734375,
      "step": 655,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9985601902008057
    },
    {
      "episode": 10512,
      "epoch": 0.06298306790811374,
      "loss/policy_avg": 0.09821736067533493,
      "lr": 9.58077709611452e-06,
      "objective/entropy": -192.20767211914062,
      "objective/kl": 28.659635543823242,
      "objective/non_score_reward": -1.4329817295074463,
      "objective/rlhf_reward": -4.070067649305449,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 5.6847333908081055,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.712890625,
      "step": 656,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9977080821990967
    },
    {
      "episode": 10528,
      "epoch": 0.0630789325472433,
      "loss/policy_avg": 0.24115119874477386,
      "lr": 9.580138036809816e-06,
      "objective/entropy": -171.08619689941406,
      "objective/kl": 26.453920364379883,
      "objective/non_score_reward": -1.3226962089538574,
      "objective/rlhf_reward": -3.8907844781875607,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 11.276920318603516,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6875,
      "step": 657,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999925136566162
    },
    {
      "episode": 10544,
      "epoch": 0.06317479718637284,
      "loss/policy_avg": -0.04878993332386017,
      "lr": 9.579498977505113e-06,
      "objective/entropy": -95.69158172607422,
      "objective/kl": 26.445575714111328,
      "objective/non_score_reward": -1.3222787380218506,
      "objective/rlhf_reward": -3.94747917941156,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 5.285589218139648,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.625,
      "step": 658,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0013298988342285
    },
    {
      "episode": 10560,
      "epoch": 0.0632706618255024,
      "loss/policy_avg": -0.10105658322572708,
      "lr": 9.57885991820041e-06,
      "objective/entropy": -209.01065063476562,
      "objective/kl": 27.234224319458008,
      "objective/non_score_reward": -1.3617112636566162,
      "objective/rlhf_reward": -4.046844816207885,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 2.436962366104126,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.6328125,
      "step": 659,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.001246690750122
    },
    {
      "episode": 10576,
      "epoch": 0.06336652646463194,
      "loss/policy_avg": -0.3218346834182739,
      "lr": 9.578220858895705e-06,
      "objective/entropy": -3.9748001098632812,
      "objective/kl": 18.186880111694336,
      "objective/non_score_reward": -0.9093440771102905,
      "objective/rlhf_reward": -1.5146698824324945,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 28.07345962524414,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.8359375,
      "step": 660,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0002238750457764
    },
    {
      "episode": 10592,
      "epoch": 0.06346239110376149,
      "loss/policy_avg": -0.19762462377548218,
      "lr": 9.577581799591002e-06,
      "objective/entropy": -204.72760009765625,
      "objective/kl": 18.785112380981445,
      "objective/non_score_reward": -0.9392555356025696,
      "objective/rlhf_reward": -1.6343160293259955,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 1.8940598964691162,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.560546875,
      "step": 661,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0025925636291504
    },
    {
      "episode": 10608,
      "epoch": 0.06355825574289103,
      "loss/policy_avg": -0.45743584632873535,
      "lr": 9.576942740286299e-06,
      "objective/entropy": -134.4844970703125,
      "objective/kl": 33.7373046875,
      "objective/non_score_reward": -1.6868653297424316,
      "objective/rlhf_reward": -5.296863298030242,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 2.153486967086792,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.521484375,
      "step": 662,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.00048828125
    },
    {
      "episode": 10624,
      "epoch": 0.06365412038202059,
      "loss/policy_avg": 0.2565079629421234,
      "lr": 9.576303680981596e-06,
      "objective/entropy": -180.13528442382812,
      "objective/kl": 17.24534034729004,
      "objective/non_score_reward": -0.862267017364502,
      "objective/rlhf_reward": -2.089818143580837,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 7.433453559875488,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.626953125,
      "step": 663,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9994475841522217
    },
    {
      "episode": 10640,
      "epoch": 0.06374998502115013,
      "loss/policy_avg": 0.17452527582645416,
      "lr": 9.575664621676893e-06,
      "objective/entropy": -64.2728271484375,
      "objective/kl": 21.405649185180664,
      "objective/non_score_reward": -1.0702824592590332,
      "objective/rlhf_reward": -2.9218800303682517,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 1.6351606845855713,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.71484375,
      "step": 664,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0012965202331543
    },
    {
      "episode": 10656,
      "epoch": 0.06384584966027969,
      "loss/policy_avg": 0.6966801881790161,
      "lr": 9.57502556237219e-06,
      "objective/entropy": -251.04238891601562,
      "objective/kl": 27.693851470947266,
      "objective/non_score_reward": -1.384692668914795,
      "objective/rlhf_reward": -3.934650454584675,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 10.390886306762695,
      "policy/clipfrac_avg": 0.25,
      "policy/entropy_avg": 0.5859375,
      "step": 665,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.001401662826538
    },
    {
      "episode": 10672,
      "epoch": 0.06394171429940923,
      "loss/policy_avg": 0.16458481550216675,
      "lr": 9.574386503067485e-06,
      "objective/entropy": -219.99136352539062,
      "objective/kl": 13.308931350708008,
      "objective/non_score_reward": -0.6654465198516846,
      "objective/rlhf_reward": -0.7143749100732166,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 3.77976131439209,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.69140625,
      "step": 666,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000656843185425
    },
    {
      "episode": 10688,
      "epoch": 0.06403757893853879,
      "loss/policy_avg": -0.009436726570129395,
      "lr": 9.573747443762782e-06,
      "objective/entropy": -162.25047302246094,
      "objective/kl": 23.977962493896484,
      "objective/non_score_reward": -1.1988980770111084,
      "objective/rlhf_reward": -2.8481810791062667,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 22.450942993164062,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.763671875,
      "step": 667,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0019123554229736
    },
    {
      "episode": 10704,
      "epoch": 0.06413344357766833,
      "loss/policy_avg": 0.4135128855705261,
      "lr": 9.573108384458079e-06,
      "objective/entropy": -63.0797119140625,
      "objective/kl": 41.37904739379883,
      "objective/non_score_reward": -2.0689523220062256,
      "objective/rlhf_reward": -6.542475895086923,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 88.98745727539062,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.779296875,
      "step": 668,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9999923706054688
    },
    {
      "episode": 10720,
      "epoch": 0.06422930821679788,
      "loss/policy_avg": 0.6821532845497131,
      "lr": 9.572469325153375e-06,
      "objective/entropy": -196.7287139892578,
      "objective/kl": 30.88260269165039,
      "objective/non_score_reward": -1.5441300868988037,
      "objective/rlhf_reward": -4.660748505386051,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 23.963293075561523,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.78515625,
      "step": 669,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9989004135131836
    },
    {
      "episode": 10736,
      "epoch": 0.06432517285592743,
      "loss/policy_avg": 0.3629915118217468,
      "lr": 9.571830265848672e-06,
      "objective/entropy": -205.541259765625,
      "objective/kl": 24.442432403564453,
      "objective/non_score_reward": -1.2221217155456543,
      "objective/rlhf_reward": -3.155153171221415,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 15.010305404663086,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.63671875,
      "step": 670,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9991261959075928
    },
    {
      "episode": 10752,
      "epoch": 0.06442103749505698,
      "loss/policy_avg": 0.3024546504020691,
      "lr": 9.571191206543968e-06,
      "objective/entropy": -184.0182647705078,
      "objective/kl": 28.46197509765625,
      "objective/non_score_reward": -1.4230988025665283,
      "objective/rlhf_reward": -3.744983862118657,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 2.1509013175964355,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.611328125,
      "step": 671,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.998319149017334
    },
    {
      "episode": 10768,
      "epoch": 0.06451690213418652,
      "loss/policy_avg": -0.12359270453453064,
      "lr": 9.570552147239264e-06,
      "objective/entropy": -107.1251220703125,
      "objective/kl": 24.85216522216797,
      "objective/non_score_reward": -1.2426085472106934,
      "objective/rlhf_reward": -3.611183964942379,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 2.815180540084839,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.62890625,
      "step": 672,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.002232074737549
    },
    {
      "episode": 10784,
      "epoch": 0.06461276677331608,
      "loss/policy_avg": 0.3783743977546692,
      "lr": 9.569913087934561e-06,
      "objective/entropy": -155.0634765625,
      "objective/kl": 33.26643371582031,
      "objective/non_score_reward": -1.663321852684021,
      "objective/rlhf_reward": -5.294037544463558,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 7.487679958343506,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.751953125,
      "step": 673,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9973864555358887
    },
    {
      "episode": 10800,
      "epoch": 0.06470863141244562,
      "loss/policy_avg": 0.12491178512573242,
      "lr": 9.569274028629858e-06,
      "objective/entropy": -202.8880157470703,
      "objective/kl": 23.53227996826172,
      "objective/non_score_reward": -1.1766140460968018,
      "objective/rlhf_reward": -2.9731229106585184,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 5.709697246551514,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.576171875,
      "step": 674,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9979543685913086
    },
    {
      "episode": 10816,
      "epoch": 0.06480449605157518,
      "loss/policy_avg": -0.01751142367720604,
      "lr": 9.568634969325155e-06,
      "objective/entropy": -217.27896118164062,
      "objective/kl": 27.020957946777344,
      "objective/non_score_reward": -1.3510478734970093,
      "objective/rlhf_reward": -3.4567805034684493,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 0.6378078460693359,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.63671875,
      "step": 675,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0012476444244385
    },
    {
      "episode": 10832,
      "epoch": 0.06490036069070472,
      "loss/policy_avg": 0.28126630187034607,
      "lr": 9.567995910020452e-06,
      "objective/entropy": -230.15963745117188,
      "objective/kl": 24.95879364013672,
      "objective/non_score_reward": -1.2479398250579834,
      "objective/rlhf_reward": -3.329899912298308,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 10.301782608032227,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.744140625,
      "step": 676,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9993771314620972
    },
    {
      "episode": 10848,
      "epoch": 0.06499622532983428,
      "loss/policy_avg": 0.12287623435258865,
      "lr": 9.567356850715747e-06,
      "objective/entropy": -263.37542724609375,
      "objective/kl": 23.937744140625,
      "objective/non_score_reward": -1.1968872547149658,
      "objective/rlhf_reward": -0.3875493764877316,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 45.05952453613281,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.703125,
      "step": 677,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9995876550674438
    },
    {
      "episode": 10864,
      "epoch": 0.06509208996896382,
      "loss/policy_avg": 0.6470179557800293,
      "lr": 9.566717791411044e-06,
      "objective/entropy": -65.45881652832031,
      "objective/kl": 23.807559967041016,
      "objective/non_score_reward": -1.190378189086914,
      "objective/rlhf_reward": -3.419877028375297,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 10.65350341796875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.744140625,
      "step": 678,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.999696969985962
    },
    {
      "episode": 10880,
      "epoch": 0.06518795460809337,
      "loss/policy_avg": 0.2790781855583191,
      "lr": 9.56607873210634e-06,
      "objective/entropy": -161.4605712890625,
      "objective/kl": 41.620460510253906,
      "objective/non_score_reward": -2.0810232162475586,
      "objective/rlhf_reward": -3.924092388153076,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 5.482306480407715,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.57421875,
      "step": 679,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9986392259597778
    },
    {
      "episode": 10896,
      "epoch": 0.06528381924722292,
      "loss/policy_avg": 0.042992569506168365,
      "lr": 9.565439672801636e-06,
      "objective/entropy": -162.92010498046875,
      "objective/kl": 26.902143478393555,
      "objective/non_score_reward": -1.3451071977615356,
      "objective/rlhf_reward": -4.001826503363949,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 4.27599573135376,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.654296875,
      "step": 680,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.998090147972107
    },
    {
      "episode": 10912,
      "epoch": 0.06537968388635247,
      "loss/policy_avg": 0.20157073438167572,
      "lr": 9.564800613496933e-06,
      "objective/entropy": -265.3901672363281,
      "objective/kl": 29.956632614135742,
      "objective/non_score_reward": -1.4978315830230713,
      "objective/rlhf_reward": -3.868620397821937,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 68.22042846679688,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.609375,
      "step": 681,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9982101917266846
    },
    {
      "episode": 10928,
      "epoch": 0.06547554852548201,
      "loss/policy_avg": 1.519484281539917,
      "lr": 9.56416155419223e-06,
      "objective/entropy": -127.62720489501953,
      "objective/kl": 23.382505416870117,
      "objective/non_score_reward": -1.1691253185272217,
      "objective/rlhf_reward": -2.2765009164810177,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 17.878856658935547,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.419921875,
      "step": 682,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9984099864959717
    },
    {
      "episode": 10944,
      "epoch": 0.06557141316461157,
      "loss/policy_avg": 0.3158057928085327,
      "lr": 9.563522494887527e-06,
      "objective/entropy": -190.45260620117188,
      "objective/kl": 25.518230438232422,
      "objective/non_score_reward": -1.275911569595337,
      "objective/rlhf_reward": -3.622693660672068,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 34.12330627441406,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.63671875,
      "step": 683,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000800371170044
    },
    {
      "episode": 10960,
      "epoch": 0.06566727780374111,
      "loss/policy_avg": 1.1294161081314087,
      "lr": 9.562883435582822e-06,
      "objective/entropy": -107.20721435546875,
      "objective/kl": 32.379913330078125,
      "objective/non_score_reward": -1.6189957857131958,
      "objective/rlhf_reward": -5.13434737017694,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 7.272080421447754,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.5234375,
      "step": 684,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.998885154724121
    },
    {
      "episode": 10976,
      "epoch": 0.06576314244287067,
      "loss/policy_avg": 0.44281357526779175,
      "lr": 9.562244376278119e-06,
      "objective/entropy": -128.0640869140625,
      "objective/kl": 20.03044891357422,
      "objective/non_score_reward": -1.0015225410461426,
      "objective/rlhf_reward": -1.082371120096418,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 12.73418140411377,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.634765625,
      "step": 685,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999821424484253
    },
    {
      "episode": 10992,
      "epoch": 0.06585900708200021,
      "loss/policy_avg": 0.2683737576007843,
      "lr": 9.561605316973416e-06,
      "objective/entropy": -258.8201904296875,
      "objective/kl": 27.295347213745117,
      "objective/non_score_reward": -1.3647674322128296,
      "objective/rlhf_reward": -2.535350595356199,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 5.86362886428833,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.71875,
      "step": 686,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9981340169906616
    },
    {
      "episode": 11008,
      "epoch": 0.06595487172112977,
      "loss/policy_avg": -0.14624132215976715,
      "lr": 9.560966257668713e-06,
      "objective/entropy": -96.99462890625,
      "objective/kl": 30.466350555419922,
      "objective/non_score_reward": -1.523317575454712,
      "objective/rlhf_reward": -4.57749816158646,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 8.779112815856934,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.494140625,
      "step": 687,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9981212615966797
    },
    {
      "episode": 11024,
      "epoch": 0.06605073636025931,
      "loss/policy_avg": 0.12842759490013123,
      "lr": 9.56032719836401e-06,
      "objective/entropy": -166.20689392089844,
      "objective/kl": 26.250516891479492,
      "objective/non_score_reward": -1.312525749206543,
      "objective/rlhf_reward": -2.8501029968261715,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 7.160890102386475,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.5703125,
      "step": 688,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9990025758743286
    },
    {
      "episode": 11040,
      "epoch": 0.06614660099938886,
      "loss/policy_avg": 0.2923339009284973,
      "lr": 9.559688139059306e-06,
      "objective/entropy": -236.72100830078125,
      "objective/kl": 33.81795883178711,
      "objective/non_score_reward": -1.6908979415893555,
      "objective/rlhf_reward": -5.4219562321001575,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 16.3193359375,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.650390625,
      "step": 689,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.99735426902771
    },
    {
      "episode": 11056,
      "epoch": 0.0662424656385184,
      "loss/policy_avg": -0.10266150534152985,
      "lr": 9.559049079754601e-06,
      "objective/entropy": -85.62126159667969,
      "objective/kl": 31.331233978271484,
      "objective/non_score_reward": -1.5665616989135742,
      "objective/rlhf_reward": -4.143540324942146,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 6.518294811248779,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.794921875,
      "step": 690,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.006378650665283
    },
    {
      "episode": 11072,
      "epoch": 0.06633833027764796,
      "loss/policy_avg": 0.17208513617515564,
      "lr": 9.558410020449898e-06,
      "objective/entropy": -175.00662231445312,
      "objective/kl": 33.992698669433594,
      "objective/non_score_reward": -1.6996350288391113,
      "objective/rlhf_reward": -5.4392902490839194,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 32.03794860839844,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.716796875,
      "step": 691,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9989235401153564
    },
    {
      "episode": 11088,
      "epoch": 0.06643419491677752,
      "loss/policy_avg": 0.01335047371685505,
      "lr": 9.557770961145195e-06,
      "objective/entropy": -248.65049743652344,
      "objective/kl": 22.41885757446289,
      "objective/non_score_reward": -1.1209429502487183,
      "objective/rlhf_reward": -2.536360512452062,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 2.7352328300476074,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.673828125,
      "step": 692,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0011818408966064
    },
    {
      "episode": 11104,
      "epoch": 0.06653005955590706,
      "loss/policy_avg": 0.14417897164821625,
      "lr": 9.557131901840492e-06,
      "objective/entropy": -218.454345703125,
      "objective/kl": 15.86509895324707,
      "objective/non_score_reward": -0.7932549715042114,
      "objective/rlhf_reward": 1.226980143785477,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 1.0328912734985352,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.640625,
      "step": 693,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 2.0003151893615723
    },
    {
      "episode": 11120,
      "epoch": 0.06662592419503661,
      "loss/policy_avg": 0.09597369283437729,
      "lr": 9.556492842535789e-06,
      "objective/entropy": -175.68487548828125,
      "objective/kl": 32.48929977416992,
      "objective/non_score_reward": -1.624464988708496,
      "objective/rlhf_reward": -2.0978601336479183,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 3.689056396484375,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.705078125,
      "step": 694,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.998678207397461
    },
    {
      "episode": 11136,
      "epoch": 0.06672178883416616,
      "loss/policy_avg": -0.004386359825730324,
      "lr": 9.555853783231084e-06,
      "objective/entropy": 122.54474639892578,
      "objective/kl": 42.134315490722656,
      "objective/non_score_reward": -2.106715679168701,
      "objective/rlhf_reward": -6.822743091646748,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 6.307683944702148,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.7421875,
      "step": 695,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9999419450759888
    },
    {
      "episode": 11152,
      "epoch": 0.06681765347329571,
      "loss/policy_avg": 0.3615373373031616,
      "lr": 9.555214723926381e-06,
      "objective/entropy": -260.84075927734375,
      "objective/kl": 35.725467681884766,
      "objective/non_score_reward": -1.7862732410430908,
      "objective/rlhf_reward": -5.664140108044505,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 45.438873291015625,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.685546875,
      "step": 696,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.996116280555725
    },
    {
      "episode": 11168,
      "epoch": 0.06691351811242525,
      "loss/policy_avg": 0.24602335691452026,
      "lr": 9.554575664621678e-06,
      "objective/entropy": -71.92741394042969,
      "objective/kl": 30.083784103393555,
      "objective/non_score_reward": -1.5041892528533936,
      "objective/rlhf_reward": -4.6575073835596275,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 5.438946723937988,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4765625,
      "step": 697,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.998178482055664
    },
    {
      "episode": 11184,
      "epoch": 0.06700938275155481,
      "loss/policy_avg": 0.034039177000522614,
      "lr": 9.553936605316975e-06,
      "objective/entropy": -198.67774963378906,
      "objective/kl": 23.375925064086914,
      "objective/non_score_reward": -1.1687963008880615,
      "objective/rlhf_reward": -1.7514660700571265,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 0.5530495643615723,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.53125,
      "step": 698,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.00034236907959
    },
    {
      "episode": 11200,
      "epoch": 0.06710524739068435,
      "loss/policy_avg": 0.5306535959243774,
      "lr": 9.553297546012272e-06,
      "objective/entropy": -143.43771362304688,
      "objective/kl": 35.411888122558594,
      "objective/non_score_reward": -1.7705943584442139,
      "objective/rlhf_reward": -5.63177965125595,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 6.416120529174805,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.66796875,
      "step": 699,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9994237422943115
    },
    {
      "episode": 11216,
      "epoch": 0.06720111202981391,
      "loss/policy_avg": 0.2092888504266739,
      "lr": 9.552658486707569e-06,
      "objective/entropy": -169.036376953125,
      "objective/kl": 30.64543914794922,
      "objective/non_score_reward": -1.5322721004486084,
      "objective/rlhf_reward": -1.7290880441665646,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 132.6121063232422,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.564453125,
      "step": 700,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9992992877960205
    },
    {
      "episode": 11232,
      "epoch": 0.06729697666894345,
      "loss/policy_avg": 0.2553282380104065,
      "lr": 9.552019427402864e-06,
      "objective/entropy": -145.8370361328125,
      "objective/kl": 31.58509063720703,
      "objective/non_score_reward": -1.5792546272277832,
      "objective/rlhf_reward": -4.760759084430292,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 23.342622756958008,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.712890625,
      "step": 701,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0007388591766357
    },
    {
      "episode": 11248,
      "epoch": 0.067392841308073,
      "loss/policy_avg": 0.1272473782300949,
      "lr": 9.55138036809816e-06,
      "objective/entropy": -283.0919494628906,
      "objective/kl": 18.825233459472656,
      "objective/non_score_reward": -0.9412617683410645,
      "objective/rlhf_reward": -2.4057970878824424,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 1.5947492122650146,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.70703125,
      "step": 702,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.999638557434082
    },
    {
      "episode": 11264,
      "epoch": 0.06748870594720255,
      "loss/policy_avg": 0.2034430205821991,
      "lr": 9.550741308793456e-06,
      "objective/entropy": -274.40478515625,
      "objective/kl": 20.724695205688477,
      "objective/non_score_reward": -1.0362348556518555,
      "objective/rlhf_reward": -1.221220110298368,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 3.738941192626953,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.61328125,
      "step": 703,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.997761607170105
    },
    {
      "episode": 11280,
      "epoch": 0.0675845705863321,
      "loss/policy_avg": 0.7114033699035645,
      "lr": 9.550102249488753e-06,
      "objective/entropy": -135.6627960205078,
      "objective/kl": 27.718311309814453,
      "objective/non_score_reward": -1.3859155178070068,
      "objective/rlhf_reward": -3.5962508422898605,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 32.94233703613281,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.64453125,
      "step": 704,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9985225200653076
    },
    {
      "episode": 11296,
      "epoch": 0.06768043522546165,
      "loss/policy_avg": -0.08856553584337234,
      "lr": 9.54946319018405e-06,
      "objective/entropy": -172.419921875,
      "objective/kl": 31.078826904296875,
      "objective/non_score_reward": -1.553941249847412,
      "objective/rlhf_reward": -4.765167097659454,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 27.00151824951172,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.76171875,
      "step": 705,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.003095865249634
    },
    {
      "episode": 11312,
      "epoch": 0.0677762998645912,
      "loss/policy_avg": -0.1016867533326149,
      "lr": 9.548824130879346e-06,
      "objective/entropy": -186.52476501464844,
      "objective/kl": 30.371601104736328,
      "objective/non_score_reward": -1.5185801982879639,
      "objective/rlhf_reward": -4.593368175442576,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 7.805020332336426,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.68359375,
      "step": 706,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0018138885498047
    },
    {
      "episode": 11328,
      "epoch": 0.06787216450372074,
      "loss/policy_avg": 0.3950710892677307,
      "lr": 9.548185071574643e-06,
      "objective/entropy": -169.30099487304688,
      "objective/kl": 26.604206085205078,
      "objective/non_score_reward": -1.3302103281021118,
      "objective/rlhf_reward": -3.9422390843308985,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 3.9309802055358887,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.650390625,
      "step": 707,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0003857612609863
    },
    {
      "episode": 11344,
      "epoch": 0.0679680291428503,
      "loss/policy_avg": 0.15957045555114746,
      "lr": 9.547546012269938e-06,
      "objective/entropy": -152.48211669921875,
      "objective/kl": 28.93355941772461,
      "objective/non_score_reward": -1.4466780424118042,
      "objective/rlhf_reward": -4.124852543295012,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 30.355663299560547,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.728515625,
      "step": 708,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 1.9971683025360107
    },
    {
      "episode": 11360,
      "epoch": 0.06806389378197984,
      "loss/policy_avg": 0.1635814905166626,
      "lr": 9.546906952965235e-06,
      "objective/entropy": -225.05284118652344,
      "objective/kl": 32.07009506225586,
      "objective/non_score_reward": -1.6035047769546509,
      "objective/rlhf_reward": -5.088506314784212,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 25.63396453857422,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.708984375,
      "step": 709,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9975805282592773
    },
    {
      "episode": 11376,
      "epoch": 0.0681597584211094,
      "loss/policy_avg": 0.22918304800987244,
      "lr": 9.546267893660532e-06,
      "objective/entropy": -245.11099243164062,
      "objective/kl": 31.21074867248535,
      "objective/non_score_reward": -1.560537576675415,
      "objective/rlhf_reward": -4.5802904419308765,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 14.6522216796875,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.658203125,
      "step": 710,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9971773624420166
    },
    {
      "episode": 11392,
      "epoch": 0.06825562306023894,
      "loss/policy_avg": -0.15267148613929749,
      "lr": 9.545628834355829e-06,
      "objective/entropy": -26.006134033203125,
      "objective/kl": 25.76430320739746,
      "objective/non_score_reward": -1.288215160369873,
      "objective/rlhf_reward": -3.2054496509599044,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 2.9515511989593506,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.75,
      "step": 711,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0021400451660156
    },
    {
      "episode": 11408,
      "epoch": 0.0683514876993685,
      "loss/policy_avg": 0.03201477974653244,
      "lr": 9.544989775051126e-06,
      "objective/entropy": -229.9574737548828,
      "objective/kl": 31.691633224487305,
      "objective/non_score_reward": -1.5845816135406494,
      "objective/rlhf_reward": -4.887728492827758,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 81.25225830078125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.74609375,
      "step": 712,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0012998580932617
    },
    {
      "episode": 11424,
      "epoch": 0.06844735233849804,
      "loss/policy_avg": 0.5598920583724976,
      "lr": 9.544350715746423e-06,
      "objective/entropy": -198.39407348632812,
      "objective/kl": 22.02547264099121,
      "objective/non_score_reward": -1.1012736558914185,
      "objective/rlhf_reward": -3.045844846700115,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 7.494403839111328,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6640625,
      "step": 713,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0001235008239746
    },
    {
      "episode": 11440,
      "epoch": 0.0685432169776276,
      "loss/policy_avg": 0.14270013570785522,
      "lr": 9.543711656441718e-06,
      "objective/entropy": -281.67730712890625,
      "objective/kl": 30.167518615722656,
      "objective/non_score_reward": -1.5083760023117065,
      "objective/rlhf_reward": -4.517732465060886,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 42.272212982177734,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6171875,
      "step": 714,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9981521368026733
    },
    {
      "episode": 11456,
      "epoch": 0.06863908161675714,
      "loss/policy_avg": 0.23854002356529236,
      "lr": 9.543072597137015e-06,
      "objective/entropy": -205.70501708984375,
      "objective/kl": 26.037616729736328,
      "objective/non_score_reward": -1.3018807172775269,
      "objective/rlhf_reward": -3.603402886454182,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 21.1671085357666,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.736328125,
      "step": 715,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999593734741211
    },
    {
      "episode": 11472,
      "epoch": 0.06873494625588669,
      "loss/policy_avg": 0.25810641050338745,
      "lr": 9.542433537832312e-06,
      "objective/entropy": -202.4583740234375,
      "objective/kl": 26.777297973632812,
      "objective/non_score_reward": -1.338865041732788,
      "objective/rlhf_reward": -3.7513400054612926,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 4.448478698730469,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.7890625,
      "step": 716,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999415636062622
    },
    {
      "episode": 11488,
      "epoch": 0.06883081089501623,
      "loss/policy_avg": 0.16866181790828705,
      "lr": 9.541794478527609e-06,
      "objective/entropy": -174.37855529785156,
      "objective/kl": 34.941444396972656,
      "objective/non_score_reward": -1.7470722198486328,
      "objective/rlhf_reward": -5.43202957412298,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 0.9149700403213501,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.69140625,
      "step": 717,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 2.000814437866211
    },
    {
      "episode": 11504,
      "epoch": 0.06892667553414579,
      "loss/policy_avg": 0.20718123018741608,
      "lr": 9.541155419222906e-06,
      "objective/entropy": -75.93595123291016,
      "objective/kl": 37.52787780761719,
      "objective/non_score_reward": -1.8763937950134277,
      "objective/rlhf_reward": -6.024622860367655,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 3.859286308288574,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5859375,
      "step": 718,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9990894794464111
    },
    {
      "episode": 11520,
      "epoch": 0.06902254017327533,
      "loss/policy_avg": -0.14078834652900696,
      "lr": 9.5405163599182e-06,
      "objective/entropy": -111.06301879882812,
      "objective/kl": 37.833980560302734,
      "objective/non_score_reward": -1.8916990756988525,
      "objective/rlhf_reward": -5.44408971287397,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 1.0138969421386719,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.646484375,
      "step": 719,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0016212463378906
    },
    {
      "episode": 11536,
      "epoch": 0.06911840481240489,
      "loss/policy_avg": -0.02326921373605728,
      "lr": 9.539877300613498e-06,
      "objective/entropy": -7.474525451660156,
      "objective/kl": 37.21611785888672,
      "objective/non_score_reward": -1.860805869102478,
      "objective/rlhf_reward": -7.443223357200623,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 0.989769458770752,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.5546875,
      "step": 720,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0015361309051514
    },
    {
      "episode": 11552,
      "epoch": 0.06921426945153443,
      "loss/policy_avg": 0.9960123896598816,
      "lr": 9.539238241308795e-06,
      "objective/entropy": -102.21640014648438,
      "objective/kl": 29.624881744384766,
      "objective/non_score_reward": -1.4812440872192383,
      "objective/rlhf_reward": -3.9775650007294967,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 1.5700416564941406,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.568359375,
      "step": 721,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.000302314758301
    },
    {
      "episode": 11568,
      "epoch": 0.06931013409066399,
      "loss/policy_avg": -0.022494332864880562,
      "lr": 9.538599182004091e-06,
      "objective/entropy": -97.00556182861328,
      "objective/kl": 34.23220443725586,
      "objective/non_score_reward": -1.7116100788116455,
      "objective/rlhf_reward": -5.520927820235414,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 6.0028605461120605,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.625,
      "step": 722,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0014429092407227
    },
    {
      "episode": 11584,
      "epoch": 0.06940599872979353,
      "loss/policy_avg": 0.2970792055130005,
      "lr": 9.537960122699387e-06,
      "objective/entropy": -218.43130493164062,
      "objective/kl": 23.677339553833008,
      "objective/non_score_reward": -1.1838669776916504,
      "objective/rlhf_reward": -0.335467970371246,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 35.85502624511719,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.744140625,
      "step": 723,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9957175254821777
    },
    {
      "episode": 11600,
      "epoch": 0.06950186336892308,
      "loss/policy_avg": 0.09062906354665756,
      "lr": 9.537321063394683e-06,
      "objective/entropy": -145.62179565429688,
      "objective/kl": 19.510597229003906,
      "objective/non_score_reward": -0.9755299091339111,
      "objective/rlhf_reward": -2.560484102278381,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 5.657525539398193,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.671875,
      "step": 724,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.002211809158325
    },
    {
      "episode": 11616,
      "epoch": 0.06959772800805263,
      "loss/policy_avg": 0.5650205612182617,
      "lr": 9.53668200408998e-06,
      "objective/entropy": -189.58197021484375,
      "objective/kl": 22.43151092529297,
      "objective/non_score_reward": -1.1215755939483643,
      "objective/rlhf_reward": -3.1446669011408384,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 16.189781188964844,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.69140625,
      "step": 725,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9978678226470947
    },
    {
      "episode": 11632,
      "epoch": 0.06969359264718218,
      "loss/policy_avg": 0.10538655519485474,
      "lr": 9.536042944785277e-06,
      "objective/entropy": -262.17254638671875,
      "objective/kl": 21.21435546875,
      "objective/non_score_reward": -1.0607177019119263,
      "objective/rlhf_reward": -2.1201648137727123,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 5.554556846618652,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.7578125,
      "step": 726,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9994568824768066
    },
    {
      "episode": 11648,
      "epoch": 0.06978945728631172,
      "loss/policy_avg": 0.08264347910881042,
      "lr": 9.535403885480572e-06,
      "objective/entropy": -144.35389709472656,
      "objective/kl": 23.849288940429688,
      "objective/non_score_reward": -1.1924644708633423,
      "objective/rlhf_reward": -3.2889052657440896,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 0.2577582895755768,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.60546875,
      "step": 727,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0009419918060303
    },
    {
      "episode": 11664,
      "epoch": 0.06988532192544128,
      "loss/policy_avg": -0.11442309617996216,
      "lr": 9.53476482617587e-06,
      "objective/entropy": -161.91555786132812,
      "objective/kl": 29.32978057861328,
      "objective/non_score_reward": -1.4664889574050903,
      "objective/rlhf_reward": -4.132622496287028,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 6.162350654602051,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.58203125,
      "step": 728,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0256590843200684
    },
    {
      "episode": 11680,
      "epoch": 0.06998118656457082,
      "loss/policy_avg": 0.15979725122451782,
      "lr": 9.534125766871166e-06,
      "objective/entropy": -46.392860412597656,
      "objective/kl": 34.71672058105469,
      "objective/non_score_reward": -1.7358360290527344,
      "objective/rlhf_reward": -5.601708403139739,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 26.208736419677734,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.837890625,
      "step": 729,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9947504997253418
    },
    {
      "episode": 11696,
      "epoch": 0.07007705120370038,
      "loss/policy_avg": 0.01945001818239689,
      "lr": 9.533486707566463e-06,
      "objective/entropy": -199.32308959960938,
      "objective/kl": 20.052722930908203,
      "objective/non_score_reward": -1.002636194229126,
      "objective/rlhf_reward": -2.586712677677242,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 5.049467086791992,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.623046875,
      "step": 730,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9999655485153198
    },
    {
      "episode": 11712,
      "epoch": 0.07017291584282992,
      "loss/policy_avg": 0.22911685705184937,
      "lr": 9.53284764826176e-06,
      "objective/entropy": -199.43820190429688,
      "objective/kl": 29.375852584838867,
      "objective/non_score_reward": -1.4687926769256592,
      "objective/rlhf_reward": -3.4751707077026364,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 1.4132235050201416,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.64453125,
      "step": 731,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9989359378814697
    },
    {
      "episode": 11728,
      "epoch": 0.07026878048195948,
      "loss/policy_avg": 0.045667171478271484,
      "lr": 9.532208588957055e-06,
      "objective/entropy": -156.77005004882812,
      "objective/kl": 28.574951171875,
      "objective/non_score_reward": -1.4287474155426025,
      "objective/rlhf_reward": -4.110870037142353,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 11.299884796142578,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.587890625,
      "step": 732,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9989423751831055
    },
    {
      "episode": 11744,
      "epoch": 0.07036464512108902,
      "loss/policy_avg": -0.07621235400438309,
      "lr": 9.531569529652352e-06,
      "objective/entropy": -211.5927734375,
      "objective/kl": 25.139881134033203,
      "objective/non_score_reward": -1.2569940090179443,
      "objective/rlhf_reward": -3.2031475856629124,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 2.0796079635620117,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.548828125,
      "step": 733,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.00140118598938
    },
    {
      "episode": 11760,
      "epoch": 0.07046050976021857,
      "loss/policy_avg": 0.3665542006492615,
      "lr": 9.530930470347649e-06,
      "objective/entropy": -136.42066955566406,
      "objective/kl": 28.39642333984375,
      "objective/non_score_reward": -1.4198211431503296,
      "objective/rlhf_reward": -5.679284453392029,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 2.8006393909454346,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.673828125,
      "step": 734,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9988253116607666
    },
    {
      "episode": 11776,
      "epoch": 0.07055637439934812,
      "loss/policy_avg": -0.16624964773654938,
      "lr": 9.530291411042946e-06,
      "objective/entropy": -172.16896057128906,
      "objective/kl": 32.62467956542969,
      "objective/non_score_reward": -1.6312339305877686,
      "objective/rlhf_reward": -5.183300068884521,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 8.176142692565918,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.587890625,
      "step": 735,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0014700889587402
    },
    {
      "episode": 11792,
      "epoch": 0.07065223903847767,
      "loss/policy_avg": -0.01751716434955597,
      "lr": 9.529652351738243e-06,
      "objective/entropy": -244.469970703125,
      "objective/kl": 21.34896469116211,
      "objective/non_score_reward": -1.0674481391906738,
      "objective/rlhf_reward": -1.346073900104734,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 1.2310829162597656,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.73828125,
      "step": 736,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0015974044799805
    },
    {
      "episode": 11808,
      "epoch": 0.07074810367760721,
      "loss/policy_avg": -0.13727766275405884,
      "lr": 9.52901329243354e-06,
      "objective/entropy": -152.7752227783203,
      "objective/kl": 30.841548919677734,
      "objective/non_score_reward": -1.5420774221420288,
      "objective/rlhf_reward": -1.7683096885681149,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 2.1432337760925293,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.732421875,
      "step": 737,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.000338315963745
    },
    {
      "episode": 11824,
      "epoch": 0.07084396831673677,
      "loss/policy_avg": 0.24724145233631134,
      "lr": 9.528374233128835e-06,
      "objective/entropy": -249.35003662109375,
      "objective/kl": 41.97819519042969,
      "objective/non_score_reward": -2.098909854888916,
      "objective/rlhf_reward": -6.945041160197601,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 14.357757568359375,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.7421875,
      "step": 738,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9984996318817139
    },
    {
      "episode": 11840,
      "epoch": 0.07093983295586631,
      "loss/policy_avg": -0.1166142150759697,
      "lr": 9.527735173824132e-06,
      "objective/entropy": 16.65149688720703,
      "objective/kl": 28.71587371826172,
      "objective/non_score_reward": -1.4357936382293701,
      "objective/rlhf_reward": -4.401539257078796,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 3.7607579231262207,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.64453125,
      "step": 739,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9990723133087158
    },
    {
      "episode": 11856,
      "epoch": 0.07103569759499587,
      "loss/policy_avg": 0.035362888127565384,
      "lr": 9.527096114519428e-06,
      "objective/entropy": -227.2210235595703,
      "objective/kl": 27.349641799926758,
      "objective/non_score_reward": -1.36748206615448,
      "objective/rlhf_reward": -3.865808401171284,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 9.06348705291748,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6640625,
      "step": 740,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9996118545532227
    },
    {
      "episode": 11872,
      "epoch": 0.07113156223412541,
      "loss/policy_avg": 0.31989267468452454,
      "lr": 9.526457055214725e-06,
      "objective/entropy": -213.7845458984375,
      "objective/kl": 34.27381896972656,
      "objective/non_score_reward": -1.713691234588623,
      "objective/rlhf_reward": -4.732058527246986,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 26.892040252685547,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.67578125,
      "step": 741,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0020368099212646
    },
    {
      "episode": 11888,
      "epoch": 0.07122742687325496,
      "loss/policy_avg": 0.18080441653728485,
      "lr": 9.525817995910022e-06,
      "objective/entropy": -164.34909057617188,
      "objective/kl": 29.15081024169922,
      "objective/non_score_reward": -1.457540512084961,
      "objective/rlhf_reward": -4.379564206214294,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 19.9893798828125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.64453125,
      "step": 742,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9979908466339111
    },
    {
      "episode": 11904,
      "epoch": 0.0713232915123845,
      "loss/policy_avg": 0.06947439908981323,
      "lr": 9.525178936605317e-06,
      "objective/entropy": -35.78013610839844,
      "objective/kl": 30.88395118713379,
      "objective/non_score_reward": -1.5441975593566895,
      "objective/rlhf_reward": -4.620531051364496,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 9.903773307800293,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.658203125,
      "step": 743,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0000925064086914
    },
    {
      "episode": 11920,
      "epoch": 0.07141915615151406,
      "loss/policy_avg": 0.4868197441101074,
      "lr": 9.524539877300614e-06,
      "objective/entropy": -185.67857360839844,
      "objective/kl": 30.794139862060547,
      "objective/non_score_reward": -1.5397069454193115,
      "objective/rlhf_reward": -4.833315048247499,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 23.752399444580078,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5703125,
      "step": 744,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9970831871032715
    },
    {
      "episode": 11936,
      "epoch": 0.0715150207906436,
      "loss/policy_avg": 0.4937871992588043,
      "lr": 9.52390081799591e-06,
      "objective/entropy": -196.15248107910156,
      "objective/kl": 32.130393981933594,
      "objective/non_score_reward": -1.6065199375152588,
      "objective/rlhf_reward": -5.084443858175903,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 4.993836402893066,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.509765625,
      "step": 745,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9994457960128784
    },
    {
      "episode": 11952,
      "epoch": 0.07161088542977316,
      "loss/policy_avg": 0.10673123598098755,
      "lr": 9.523261758691206e-06,
      "objective/entropy": -74.68463134765625,
      "objective/kl": 34.281944274902344,
      "objective/non_score_reward": -1.7140971422195435,
      "objective/rlhf_reward": -3.9326697334062786,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 9.657389640808105,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4892578125,
      "step": 746,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.998197078704834
    },
    {
      "episode": 11968,
      "epoch": 0.0717067500689027,
      "loss/policy_avg": 0.08303539454936981,
      "lr": 9.522622699386503e-06,
      "objective/entropy": -234.022705078125,
      "objective/kl": 26.956684112548828,
      "objective/non_score_reward": -1.3478342294692993,
      "objective/rlhf_reward": -3.26863074518827,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 9.614282608032227,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.77734375,
      "step": 747,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.9977872371673584
    },
    {
      "episode": 11984,
      "epoch": 0.07180261470803226,
      "loss/policy_avg": 0.006275704130530357,
      "lr": 9.5219836400818e-06,
      "objective/entropy": -179.78111267089844,
      "objective/kl": 24.191059112548828,
      "objective/non_score_reward": -1.2095528841018677,
      "objective/rlhf_reward": -3.4789615509256553,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 3.5060572624206543,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.48828125,
      "step": 748,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.001844882965088
    },
    {
      "episode": 12000,
      "epoch": 0.07189847934716181,
      "loss/policy_avg": 0.05262988060712814,
      "lr": 9.521344580777097e-06,
      "objective/entropy": -61.52648162841797,
      "objective/kl": 24.345882415771484,
      "objective/non_score_reward": -1.2172942161560059,
      "objective/rlhf_reward": -2.7464705727258067,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 26.343456268310547,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.734375,
      "step": 749,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9996674060821533
    },
    {
      "episode": 12016,
      "epoch": 0.07199434398629136,
      "loss/policy_avg": 0.1489763706922531,
      "lr": 9.520705521472394e-06,
      "objective/entropy": -179.14523315429688,
      "objective/kl": 25.692440032958984,
      "objective/non_score_reward": -1.284622073173523,
      "objective/rlhf_reward": -3.19107700415128,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 1.4589556455612183,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.517578125,
      "step": 750,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9989516735076904
    },
    {
      "episode": 12032,
      "epoch": 0.07209020862542091,
      "loss/policy_avg": 0.06708867847919464,
      "lr": 9.520066462167689e-06,
      "objective/entropy": -56.47541427612305,
      "objective/kl": 42.95630645751953,
      "objective/non_score_reward": -2.147815227508545,
      "objective/rlhf_reward": -6.7664322808113795,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 6.856327056884766,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.650390625,
      "step": 751,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9969167709350586
    },
    {
      "episode": 12048,
      "epoch": 0.07218607326455045,
      "loss/policy_avg": 0.3973958194255829,
      "lr": 9.519427402862986e-06,
      "objective/entropy": -244.11431884765625,
      "objective/kl": 25.62933921813965,
      "objective/non_score_reward": -1.2814669609069824,
      "objective/rlhf_reward": -3.301039035591196,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 48.01885223388672,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.671875,
      "step": 752,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9983410835266113
    },
    {
      "episode": 12064,
      "epoch": 0.07228193790368001,
      "loss/policy_avg": 0.016892850399017334,
      "lr": 9.518788343558283e-06,
      "objective/entropy": -233.80613708496094,
      "objective/kl": 33.0050048828125,
      "objective/non_score_reward": -1.6502504348754883,
      "objective/rlhf_reward": -4.653590510563786,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 15.416328430175781,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.560546875,
      "step": 753,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.999025821685791
    },
    {
      "episode": 12080,
      "epoch": 0.07237780254280955,
      "loss/policy_avg": 0.10087546706199646,
      "lr": 9.51814928425358e-06,
      "objective/entropy": -283.5254211425781,
      "objective/kl": 25.051952362060547,
      "objective/non_score_reward": -1.2525975704193115,
      "objective/rlhf_reward": -2.6103905797004696,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 19.29462432861328,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6328125,
      "step": 754,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.9981729984283447
    },
    {
      "episode": 12096,
      "epoch": 0.07247366718193911,
      "loss/policy_avg": 0.24108710885047913,
      "lr": 9.517510224948877e-06,
      "objective/entropy": -211.13575744628906,
      "objective/kl": 35.66078186035156,
      "objective/non_score_reward": -1.7830390930175781,
      "objective/rlhf_reward": -5.708324392040339,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 11.15980339050293,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.60546875,
      "step": 755,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9982430934906006
    },
    {
      "episode": 12112,
      "epoch": 0.07256953182106865,
      "loss/policy_avg": 0.6718421578407288,
      "lr": 9.516871165644172e-06,
      "objective/entropy": -148.00872802734375,
      "objective/kl": 30.348403930664062,
      "objective/non_score_reward": -1.5174202919006348,
      "objective/rlhf_reward": -4.669681048393249,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 24.264657974243164,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.626953125,
      "step": 756,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9989105463027954
    },
    {
      "episode": 12128,
      "epoch": 0.0726653964601982,
      "loss/policy_avg": 0.17684796452522278,
      "lr": 9.516232106339469e-06,
      "objective/entropy": -220.75283813476562,
      "objective/kl": 18.81310272216797,
      "objective/non_score_reward": -0.9406551122665405,
      "objective/rlhf_reward": -2.3840183998025477,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 3.753880500793457,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.498046875,
      "step": 757,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9982280731201172
    },
    {
      "episode": 12144,
      "epoch": 0.07276126109932775,
      "loss/policy_avg": 0.5594636797904968,
      "lr": 9.515593047034765e-06,
      "objective/entropy": -182.7705535888672,
      "objective/kl": 19.829849243164062,
      "objective/non_score_reward": -0.991492509841919,
      "objective/rlhf_reward": 0.4340301394462589,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 28.46674346923828,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.572265625,
      "step": 758,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9991915225982666
    },
    {
      "episode": 12160,
      "epoch": 0.0728571257384573,
      "loss/policy_avg": 0.6502060890197754,
      "lr": 9.514953987730062e-06,
      "objective/entropy": -112.33629608154297,
      "objective/kl": 39.52580642700195,
      "objective/non_score_reward": -1.9762903451919556,
      "objective/rlhf_reward": -5.78245514847425,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 4.3783769607543945,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.57421875,
      "step": 759,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9975318908691406
    },
    {
      "episode": 12176,
      "epoch": 0.07295299037758685,
      "loss/policy_avg": 0.88495934009552,
      "lr": 9.51431492842536e-06,
      "objective/entropy": -201.14666748046875,
      "objective/kl": 27.90923309326172,
      "objective/non_score_reward": -1.3954615592956543,
      "objective/rlhf_reward": -4.240210583716064,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 3.1258697509765625,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.66796875,
      "step": 760,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0020923614501953
    },
    {
      "episode": 12192,
      "epoch": 0.0730488550167164,
      "loss/policy_avg": 0.3271714448928833,
      "lr": 9.513675869120656e-06,
      "objective/entropy": -236.55361938476562,
      "objective/kl": 28.77971839904785,
      "objective/non_score_reward": -1.43898606300354,
      "objective/rlhf_reward": -4.35594413280487,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 5.469420909881592,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6171875,
      "step": 761,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.997791051864624
    },
    {
      "episode": 12208,
      "epoch": 0.07314471965584594,
      "loss/policy_avg": 0.0032866448163986206,
      "lr": 9.513036809815951e-06,
      "objective/entropy": -200.22227478027344,
      "objective/kl": 28.73204803466797,
      "objective/non_score_reward": -1.4366023540496826,
      "objective/rlhf_reward": -4.142289552752095,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 0.5752939581871033,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.587890625,
      "step": 762,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 2.0023627281188965
    },
    {
      "episode": 12224,
      "epoch": 0.0732405842949755,
      "loss/policy_avg": 0.21868771314620972,
      "lr": 9.512397750511248e-06,
      "objective/entropy": -187.9447784423828,
      "objective/kl": 20.44854736328125,
      "objective/non_score_reward": -1.0224274396896362,
      "objective/rlhf_reward": -2.5739379761540255,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 11.789055824279785,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6328125,
      "step": 763,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9979248046875
    },
    {
      "episode": 12240,
      "epoch": 0.07333644893410504,
      "loss/policy_avg": 0.3879333734512329,
      "lr": 9.511758691206545e-06,
      "objective/entropy": -267.96685791015625,
      "objective/kl": 28.91057586669922,
      "objective/non_score_reward": -1.4455287456512451,
      "objective/rlhf_reward": -3.8347037536668136,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 4.813044548034668,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.6953125,
      "step": 764,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0009827613830566
    },
    {
      "episode": 12256,
      "epoch": 0.0734323135732346,
      "loss/policy_avg": 0.06569409370422363,
      "lr": 9.511119631901842e-06,
      "objective/entropy": -207.83352661132812,
      "objective/kl": 24.208805084228516,
      "objective/non_score_reward": -1.2104402780532837,
      "objective/rlhf_reward": -3.2855019261508733,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 2.039762258529663,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.658203125,
      "step": 765,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0008108615875244
    },
    {
      "episode": 12272,
      "epoch": 0.07352817821236414,
      "loss/policy_avg": 0.9109029769897461,
      "lr": 9.510480572597139e-06,
      "objective/entropy": -85.82101440429688,
      "objective/kl": 31.18517303466797,
      "objective/non_score_reward": -1.5592585802078247,
      "objective/rlhf_reward": -4.50370092789332,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 4.987689018249512,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.56640625,
      "step": 766,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.997962236404419
    },
    {
      "episode": 12288,
      "epoch": 0.0736240428514937,
      "loss/policy_avg": 0.44006603956222534,
      "lr": 9.509841513292434e-06,
      "objective/entropy": -254.5596923828125,
      "objective/kl": 26.123559951782227,
      "objective/non_score_reward": -1.3061779737472534,
      "objective/rlhf_reward": -3.6684524109035284,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 13.005337715148926,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.552734375,
      "step": 767,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9978489875793457
    },
    {
      "episode": 12304,
      "epoch": 0.07371990749062324,
      "loss/policy_avg": 0.14191022515296936,
      "lr": 9.509202453987731e-06,
      "objective/entropy": -185.1569061279297,
      "objective/kl": 38.093666076660156,
      "objective/non_score_reward": -1.9046835899353027,
      "objective/rlhf_reward": -7.618734002113342,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 60.80290603637695,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.640625,
      "step": 768,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9955925941467285
    },
    {
      "episode": 12320,
      "epoch": 0.0738157721297528,
      "loss/policy_avg": -0.31537145376205444,
      "lr": 9.508563394683026e-06,
      "objective/entropy": -164.9215087890625,
      "objective/kl": 30.594449996948242,
      "objective/non_score_reward": -1.5297224521636963,
      "objective/rlhf_reward": -4.63793725055015,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 1.2754226922988892,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.6171875,
      "step": 769,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001909017562866
    },
    {
      "episode": 12336,
      "epoch": 0.07391163676888234,
      "loss/policy_avg": 0.034731436520814896,
      "lr": 9.507924335378323e-06,
      "objective/entropy": -200.43959045410156,
      "objective/kl": 36.4830436706543,
      "objective/non_score_reward": -1.8241522312164307,
      "objective/rlhf_reward": -5.8727765872078805,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 3.3153905868530273,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.626953125,
      "step": 770,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0031118392944336
    },
    {
      "episode": 12352,
      "epoch": 0.07400750140801189,
      "loss/policy_avg": 0.29965466260910034,
      "lr": 9.50728527607362e-06,
      "objective/entropy": -168.58261108398438,
      "objective/kl": 34.881736755371094,
      "objective/non_score_reward": -1.7440869808197021,
      "objective/rlhf_reward": -5.314488296926605,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 12.419918060302734,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.744140625,
      "step": 771,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999511957168579
    },
    {
      "episode": 12368,
      "epoch": 0.07410336604714143,
      "loss/policy_avg": 0.5840628743171692,
      "lr": 9.506646216768917e-06,
      "objective/entropy": -149.50210571289062,
      "objective/kl": 26.40768051147461,
      "objective/non_score_reward": -1.3203840255737305,
      "objective/rlhf_reward": -3.8005837230042214,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 10.453241348266602,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.560546875,
      "step": 772,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9990813732147217
    },
    {
      "episode": 12384,
      "epoch": 0.07419923068627099,
      "loss/policy_avg": -0.20146791636943817,
      "lr": 9.506007157464214e-06,
      "objective/entropy": -206.66688537597656,
      "objective/kl": 25.146541595458984,
      "objective/non_score_reward": -1.2573271989822388,
      "objective/rlhf_reward": -5.029308795928955,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 55.61228561401367,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.728515625,
      "step": 773,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9993964433670044
    },
    {
      "episode": 12400,
      "epoch": 0.07429509532540053,
      "loss/policy_avg": 2.0998456478118896,
      "lr": 9.50536809815951e-06,
      "objective/entropy": -135.09249877929688,
      "objective/kl": 26.86371612548828,
      "objective/non_score_reward": -1.3431859016418457,
      "objective/rlhf_reward": -3.922145526023254,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 7.190234184265137,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.548828125,
      "step": 774,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0000576972961426
    },
    {
      "episode": 12416,
      "epoch": 0.07439095996453009,
      "loss/policy_avg": 0.024284163489937782,
      "lr": 9.504729038854806e-06,
      "objective/entropy": -269.6484375,
      "objective/kl": 21.226428985595703,
      "objective/non_score_reward": -1.061321496963501,
      "objective/rlhf_reward": -2.7946879669145197,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 6.07242488861084,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.66796875,
      "step": 775,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999997615814209
    },
    {
      "episode": 12432,
      "epoch": 0.07448682460365963,
      "loss/policy_avg": 0.03317616134881973,
      "lr": 9.504089979550103e-06,
      "objective/entropy": -234.43389892578125,
      "objective/kl": 27.79866600036621,
      "objective/non_score_reward": -1.3899333477020264,
      "objective/rlhf_reward": -3.8264001766840616,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 1.3638486862182617,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.689453125,
      "step": 776,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9996023178100586
    },
    {
      "episode": 12448,
      "epoch": 0.07458268924278919,
      "loss/policy_avg": 0.16213266551494598,
      "lr": 9.5034509202454e-06,
      "objective/entropy": -203.708740234375,
      "objective/kl": 38.612911224365234,
      "objective/non_score_reward": -1.9306457042694092,
      "objective/rlhf_reward": -6.271984438510284,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 1.698218584060669,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.64453125,
      "step": 777,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9995825290679932
    },
    {
      "episode": 12464,
      "epoch": 0.07467855388191873,
      "loss/policy_avg": 0.2597602605819702,
      "lr": 9.502811860940696e-06,
      "objective/entropy": -250.4356231689453,
      "objective/kl": 30.581310272216797,
      "objective/non_score_reward": -1.529065489768982,
      "objective/rlhf_reward": -4.737660029021603,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 8.781853675842285,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.615234375,
      "step": 778,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9989944696426392
    },
    {
      "episode": 12480,
      "epoch": 0.07477441852104828,
      "loss/policy_avg": -0.24061758816242218,
      "lr": 9.502172801635993e-06,
      "objective/entropy": -98.61205291748047,
      "objective/kl": 26.375612258911133,
      "objective/non_score_reward": -1.3187806606292725,
      "objective/rlhf_reward": -3.794169786389231,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 5.955351829528809,
      "policy/clipfrac_avg": 2.0,
      "policy/entropy_avg": 0.7265625,
      "step": 779,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.003558397293091
    },
    {
      "episode": 12496,
      "epoch": 0.07487028316017783,
      "loss/policy_avg": 0.48288995027542114,
      "lr": 9.50153374233129e-06,
      "objective/entropy": -230.7918701171875,
      "objective/kl": 37.52941131591797,
      "objective/non_score_reward": -1.8764704465866089,
      "objective/rlhf_reward": -6.024929526265025,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 12.408464431762695,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.68359375,
      "step": 780,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9990174770355225
    },
    {
      "episode": 12512,
      "epoch": 0.07496614779930738,
      "loss/policy_avg": 0.27871203422546387,
      "lr": 9.500894683026585e-06,
      "objective/entropy": -159.85903930664062,
      "objective/kl": 25.038909912109375,
      "objective/non_score_reward": -1.2519454956054688,
      "objective/rlhf_reward": -2.607781863212585,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 46.26438903808594,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.658203125,
      "step": 781,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.000070095062256
    },
    {
      "episode": 12528,
      "epoch": 0.07506201243843692,
      "loss/policy_avg": 0.06291055679321289,
      "lr": 9.500255623721882e-06,
      "objective/entropy": -163.0406494140625,
      "objective/kl": 27.101749420166016,
      "objective/non_score_reward": -1.3550875186920166,
      "objective/rlhf_reward": -4.061099970076961,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 13.61475658416748,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.63671875,
      "step": 782,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9986028671264648
    },
    {
      "episode": 12544,
      "epoch": 0.07515787707756648,
      "loss/policy_avg": 0.07766500115394592,
      "lr": 9.499616564417179e-06,
      "objective/entropy": -264.68377685546875,
      "objective/kl": 26.38882827758789,
      "objective/non_score_reward": -1.319441556930542,
      "objective/rlhf_reward": -2.3540468558084697,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 29.816272735595703,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62890625,
      "step": 783,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9993165731430054
    },
    {
      "episode": 12560,
      "epoch": 0.07525374171669602,
      "loss/policy_avg": -0.25779616832733154,
      "lr": 9.498977505112476e-06,
      "objective/entropy": -192.4373016357422,
      "objective/kl": 30.569807052612305,
      "objective/non_score_reward": -1.528490424156189,
      "objective/rlhf_reward": -4.5098417139688305,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 3.409776210784912,
      "policy/clipfrac_avg": 2.0,
      "policy/entropy_avg": 0.765625,
      "step": 784,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0031237602233887
    },
    {
      "episode": 12576,
      "epoch": 0.07534960635582558,
      "loss/policy_avg": -0.23182180523872375,
      "lr": 9.498338445807773e-06,
      "objective/entropy": -116.57367706298828,
      "objective/kl": 30.319534301757812,
      "objective/non_score_reward": -1.5159766674041748,
      "objective/rlhf_reward": -4.704657160972042,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 3.2308108806610107,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.3955078125,
      "step": 785,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.001164436340332
    },
    {
      "episode": 12592,
      "epoch": 0.07544547099495512,
      "loss/policy_avg": 0.270114541053772,
      "lr": 9.497699386503068e-06,
      "objective/entropy": -213.6279296875,
      "objective/kl": 34.02395248413086,
      "objective/non_score_reward": -1.701197624206543,
      "objective/rlhf_reward": -3.8810713633310527,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 4.159467697143555,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.556640625,
      "step": 786,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999849557876587
    },
    {
      "episode": 12608,
      "epoch": 0.07554133563408467,
      "loss/policy_avg": 0.01593317836523056,
      "lr": 9.497060327198365e-06,
      "objective/entropy": -83.6307601928711,
      "objective/kl": 28.397233963012695,
      "objective/non_score_reward": -1.4198617935180664,
      "objective/rlhf_reward": -4.198494317944407,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 13.974614143371582,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.576171875,
      "step": 787,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9974963665008545
    },
    {
      "episode": 12624,
      "epoch": 0.07563720027321422,
      "loss/policy_avg": 0.122782863676548,
      "lr": 9.496421267893662e-06,
      "objective/entropy": -66.27203369140625,
      "objective/kl": 20.0443115234375,
      "objective/non_score_reward": -1.0022156238555908,
      "objective/rlhf_reward": -2.6302602673448146,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 5.128955364227295,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.78125,
      "step": 788,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.002767562866211
    },
    {
      "episode": 12640,
      "epoch": 0.07573306491234377,
      "loss/policy_avg": 0.06789802759885788,
      "lr": 9.495782208588959e-06,
      "objective/entropy": -174.1296844482422,
      "objective/kl": 28.25243377685547,
      "objective/non_score_reward": -1.4126217365264893,
      "objective/rlhf_reward": -3.988627438963042,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 21.132152557373047,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.802734375,
      "step": 789,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0010154247283936
    },
    {
      "episode": 12656,
      "epoch": 0.07582892955147331,
      "loss/policy_avg": 0.1666814684867859,
      "lr": 9.495143149284254e-06,
      "objective/entropy": -226.70257568359375,
      "objective/kl": 28.976097106933594,
      "objective/non_score_reward": -1.4488048553466797,
      "objective/rlhf_reward": -4.371387500961391,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 2.0613138675689697,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.62890625,
      "step": 790,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9994125366210938
    },
    {
      "episode": 12672,
      "epoch": 0.07592479419060287,
      "loss/policy_avg": 0.1284073442220688,
      "lr": 9.49450408997955e-06,
      "objective/entropy": -215.84002685546875,
      "objective/kl": 28.486852645874023,
      "objective/non_score_reward": -1.4243427515029907,
      "objective/rlhf_reward": -4.355735114126831,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 5.659012317657471,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.64453125,
      "step": 791,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0002176761627197
    },
    {
      "episode": 12688,
      "epoch": 0.07602065882973241,
      "loss/policy_avg": -0.04723303020000458,
      "lr": 9.493865030674848e-06,
      "objective/entropy": -227.61280822753906,
      "objective/kl": 28.772476196289062,
      "objective/non_score_reward": -1.4386236667633057,
      "objective/rlhf_reward": -2.830775891185972,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 1.8349313735961914,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.609375,
      "step": 792,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.002286911010742
    },
    {
      "episode": 12704,
      "epoch": 0.07611652346886197,
      "loss/policy_avg": -0.01974731869995594,
      "lr": 9.493225971370144e-06,
      "objective/entropy": -168.45291137695312,
      "objective/kl": 32.674957275390625,
      "objective/non_score_reward": -1.633747935295105,
      "objective/rlhf_reward": -5.209478828936739,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 0.8098639249801636,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6015625,
      "step": 793,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0004916191101074
    },
    {
      "episode": 12720,
      "epoch": 0.07621238810799151,
      "loss/policy_avg": 0.3524478077888489,
      "lr": 9.49258691206544e-06,
      "objective/entropy": -170.04669189453125,
      "objective/kl": 35.1775016784668,
      "objective/non_score_reward": -1.7588751316070557,
      "objective/rlhf_reward": -5.479241101947382,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 4.70783805847168,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.658203125,
      "step": 794,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9981024265289307
    },
    {
      "episode": 12736,
      "epoch": 0.07630825274712107,
      "loss/policy_avg": 0.14937232434749603,
      "lr": 9.491947852760736e-06,
      "objective/entropy": -258.00518798828125,
      "objective/kl": 30.382396697998047,
      "objective/non_score_reward": -1.5191197395324707,
      "objective/rlhf_reward": -4.472358975473957,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 8.522323608398438,
      "policy/clipfrac_avg": 0.25,
      "policy/entropy_avg": 0.68359375,
      "step": 795,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.00205659866333
    },
    {
      "episode": 12752,
      "epoch": 0.07640411738625061,
      "loss/policy_avg": 0.4101511240005493,
      "lr": 9.491308793456033e-06,
      "objective/entropy": -97.3719482421875,
      "objective/kl": 49.89447021484375,
      "objective/non_score_reward": -2.4947237968444824,
      "objective/rlhf_reward": -7.578894591331482,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 19.377134323120117,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.498046875,
      "step": 796,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9981412887573242
    },
    {
      "episode": 12768,
      "epoch": 0.07649998202538016,
      "loss/policy_avg": -0.0627971962094307,
      "lr": 9.49066973415133e-06,
      "objective/entropy": -110.8655776977539,
      "objective/kl": 44.73468017578125,
      "objective/non_score_reward": -2.23673415184021,
      "objective/rlhf_reward": -6.546936726570129,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 5.804272651672363,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.806640625,
      "step": 797,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9971017837524414
    },
    {
      "episode": 12784,
      "epoch": 0.0765958466645097,
      "loss/policy_avg": 0.3731452226638794,
      "lr": 9.490030674846627e-06,
      "objective/entropy": -15.07757568359375,
      "objective/kl": 24.15683364868164,
      "objective/non_score_reward": -1.2078416347503662,
      "objective/rlhf_reward": -2.4313664793968197,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 5.745340347290039,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.638671875,
      "step": 798,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9993362426757812
    },
    {
      "episode": 12800,
      "epoch": 0.07669171130363926,
      "loss/policy_avg": 0.3336324691772461,
      "lr": 9.489391615541922e-06,
      "objective/entropy": -249.59414672851562,
      "objective/kl": 28.68617820739746,
      "objective/non_score_reward": -1.4343090057373047,
      "objective/rlhf_reward": -2.8135166510355205,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 3.9479708671569824,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.693359375,
      "step": 799,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9993374347686768
    },
    {
      "episode": 12816,
      "epoch": 0.0767875759427688,
      "loss/policy_avg": 0.12261458486318588,
      "lr": 9.488752556237219e-06,
      "objective/entropy": -207.68580627441406,
      "objective/kl": 33.91386413574219,
      "objective/non_score_reward": -1.6956932544708252,
      "objective/rlhf_reward": -5.301820400174021,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 25.18114471435547,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.625,
      "step": 800,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9971894025802612
    },
    {
      "episode": 12832,
      "epoch": 0.07688344058189836,
      "loss/policy_avg": 0.1192292720079422,
      "lr": 9.488113496932516e-06,
      "objective/entropy": -268.4300842285156,
      "objective/kl": 26.710205078125,
      "objective/non_score_reward": -1.3355103731155396,
      "objective/rlhf_reward": -4.000405719786316,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 4.064979553222656,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.638671875,
      "step": 801,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9994006156921387
    },
    {
      "episode": 12848,
      "epoch": 0.0769793052210279,
      "loss/policy_avg": 0.4274081587791443,
      "lr": 9.487474437627813e-06,
      "objective/entropy": -125.00625610351562,
      "objective/kl": 36.30561065673828,
      "objective/non_score_reward": -1.815280795097351,
      "objective/rlhf_reward": -5.901873194907589,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 9.215574264526367,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.55078125,
      "step": 802,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999779224395752
    },
    {
      "episode": 12864,
      "epoch": 0.07707516986015746,
      "loss/policy_avg": 0.02082793414592743,
      "lr": 9.48683537832311e-06,
      "objective/entropy": 49.048545837402344,
      "objective/kl": 31.830245971679688,
      "objective/non_score_reward": -1.5915122032165527,
      "objective/rlhf_reward": -4.915450672717437,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 2.6811680793762207,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.4384765625,
      "step": 803,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 1,
      "val/ratio": 1.9994103908538818
    },
    {
      "episode": 12880,
      "epoch": 0.077171034499287,
      "loss/policy_avg": 0.1582624763250351,
      "lr": 9.486196319018407e-06,
      "objective/entropy": -110.25260925292969,
      "objective/kl": 31.00435447692871,
      "objective/non_score_reward": -1.550217866897583,
      "objective/rlhf_reward": -3.8008712291717526,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 3.5253429412841797,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.55859375,
      "step": 804,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0027151107788086
    },
    {
      "episode": 12896,
      "epoch": 0.07726689913841656,
      "loss/policy_avg": 0.09249435365200043,
      "lr": 9.485557259713702e-06,
      "objective/entropy": -203.63662719726562,
      "objective/kl": 31.04816436767578,
      "objective/non_score_reward": -1.552408218383789,
      "objective/rlhf_reward": -4.547773247182952,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 1.3485993146896362,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.646484375,
      "step": 805,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999431848526001
    },
    {
      "episode": 12912,
      "epoch": 0.0773627637775461,
      "loss/policy_avg": 0.44563794136047363,
      "lr": 9.484918200408999e-06,
      "objective/entropy": -163.74508666992188,
      "objective/kl": 31.982746124267578,
      "objective/non_score_reward": -1.599137306213379,
      "objective/rlhf_reward": -3.472829972149107,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 87.72571563720703,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.70703125,
      "step": 806,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001553535461426
    },
    {
      "episode": 12928,
      "epoch": 0.07745862841667565,
      "loss/policy_avg": -0.017649848014116287,
      "lr": 9.484279141104296e-06,
      "objective/entropy": -266.5451965332031,
      "objective/kl": 27.058134078979492,
      "objective/non_score_reward": -1.3529068231582642,
      "objective/rlhf_reward": -1.0116270542144772,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 5.037982940673828,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.654296875,
      "step": 807,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0039749145507812
    },
    {
      "episode": 12944,
      "epoch": 0.07755449305580521,
      "loss/policy_avg": 5.042888641357422,
      "lr": 9.483640081799592e-06,
      "objective/entropy": -212.65740966796875,
      "objective/kl": 24.790084838867188,
      "objective/non_score_reward": -1.2395042181015015,
      "objective/rlhf_reward": -3.4770642546967263,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 11.046760559082031,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.533203125,
      "step": 808,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0002431869506836
    },
    {
      "episode": 12960,
      "epoch": 0.07765035769493475,
      "loss/policy_avg": -0.07623002678155899,
      "lr": 9.48300102249489e-06,
      "objective/entropy": -167.7131805419922,
      "objective/kl": 31.204689025878906,
      "objective/non_score_reward": -1.5602343082427979,
      "objective/rlhf_reward": -4.790339152427062,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 5.110037803649902,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5234375,
      "step": 809,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9989296197891235
    },
    {
      "episode": 12976,
      "epoch": 0.07774622233406431,
      "loss/policy_avg": 0.0697702169418335,
      "lr": 9.482361963190185e-06,
      "objective/entropy": -99.56057739257812,
      "objective/kl": 40.95980453491211,
      "objective/non_score_reward": -2.047990322113037,
      "objective/rlhf_reward": -5.268241856933805,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 1.0177828073501587,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.595703125,
      "step": 810,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9999544620513916
    },
    {
      "episode": 12992,
      "epoch": 0.07784208697319385,
      "loss/policy_avg": 0.011765815317630768,
      "lr": 9.481722903885481e-06,
      "objective/entropy": -270.2078857421875,
      "objective/kl": 32.53266906738281,
      "objective/non_score_reward": -1.6266334056854248,
      "objective/rlhf_reward": -4.950274675098017,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 10.882495880126953,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.685546875,
      "step": 811,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.997708797454834
    },
    {
      "episode": 13008,
      "epoch": 0.0779379516123234,
      "loss/policy_avg": 0.4012794494628906,
      "lr": 9.481083844580777e-06,
      "objective/entropy": -139.22914123535156,
      "objective/kl": 37.05573272705078,
      "objective/non_score_reward": -1.8527867794036865,
      "objective/rlhf_reward": -5.586318249973367,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 210.83877563476562,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6640625,
      "step": 812,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.001636505126953
    },
    {
      "episode": 13024,
      "epoch": 0.07803381625145295,
      "loss/policy_avg": 0.2699980139732361,
      "lr": 9.480444785276073e-06,
      "objective/entropy": -196.59963989257812,
      "objective/kl": 30.699893951416016,
      "objective/non_score_reward": -1.5349947214126587,
      "objective/rlhf_reward": -3.7399788856506344,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 2.332146167755127,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.619140625,
      "step": 813,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9989255666732788
    },
    {
      "episode": 13040,
      "epoch": 0.0781296808905825,
      "loss/policy_avg": 0.20207370817661285,
      "lr": 9.47980572597137e-06,
      "objective/entropy": -267.2593994140625,
      "objective/kl": 33.34029006958008,
      "objective/non_score_reward": -1.6670145988464355,
      "objective/rlhf_reward": -5.342545185118837,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 1.632169246673584,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.564453125,
      "step": 814,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9993088245391846
    },
    {
      "episode": 13056,
      "epoch": 0.07822554552971205,
      "loss/policy_avg": 0.1745888739824295,
      "lr": 9.479166666666667e-06,
      "objective/entropy": -108.20680236816406,
      "objective/kl": 35.203025817871094,
      "objective/non_score_reward": -1.7601512670516968,
      "objective/rlhf_reward": -5.484345762935236,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 8.32550048828125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.736328125,
      "step": 815,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0030176639556885
    },
    {
      "episode": 13072,
      "epoch": 0.0783214101688416,
      "loss/policy_avg": 0.2600640654563904,
      "lr": 9.478527607361964e-06,
      "objective/entropy": -204.03048706054688,
      "objective/kl": 40.41114807128906,
      "objective/non_score_reward": -2.020557403564453,
      "objective/rlhf_reward": -6.74059360316339,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 4.140628814697266,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.619140625,
      "step": 816,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0000810623168945
    },
    {
      "episode": 13088,
      "epoch": 0.07841727480797114,
      "loss/policy_avg": 0.5273202061653137,
      "lr": 9.477888548057261e-06,
      "objective/entropy": -241.156494140625,
      "objective/kl": 24.541404724121094,
      "objective/non_score_reward": -1.2270702123641968,
      "objective/rlhf_reward": -3.246421401918517,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 14.965031623840332,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.61328125,
      "step": 817,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9990627765655518
    },
    {
      "episode": 13104,
      "epoch": 0.0785131394471007,
      "loss/policy_avg": -0.09151424467563629,
      "lr": 9.477249488752556e-06,
      "objective/entropy": -219.21754455566406,
      "objective/kl": 31.261905670166016,
      "objective/non_score_reward": -1.5630953311920166,
      "objective/rlhf_reward": -4.4275525763359775,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 2.8227334022521973,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.833984375,
      "step": 818,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0036370754241943
    },
    {
      "episode": 13120,
      "epoch": 0.07860900408623024,
      "loss/policy_avg": 0.13953115046024323,
      "lr": 9.476610429447853e-06,
      "objective/entropy": -186.8937530517578,
      "objective/kl": 27.69632339477539,
      "objective/non_score_reward": -1.3848161697387695,
      "objective/rlhf_reward": -3.1392647981643673,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 3.2056455612182617,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6015625,
      "step": 819,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 2.003466844558716
    },
    {
      "episode": 13136,
      "epoch": 0.0787048687253598,
      "loss/policy_avg": 0.6420396566390991,
      "lr": 9.47597137014315e-06,
      "objective/entropy": -134.00025939941406,
      "objective/kl": 22.993852615356445,
      "objective/non_score_reward": -1.1496926546096802,
      "objective/rlhf_reward": -2.651359389500554,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 3.607414722442627,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.611328125,
      "step": 820,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.000748634338379
    },
    {
      "episode": 13152,
      "epoch": 0.07880073336448934,
      "loss/policy_avg": 0.08356916159391403,
      "lr": 9.475332310838447e-06,
      "objective/entropy": -189.72003173828125,
      "objective/kl": 26.506973266601562,
      "objective/non_score_reward": -1.3253486156463623,
      "objective/rlhf_reward": -3.959759166746765,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 4.290050029754639,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62890625,
      "step": 821,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9971721172332764
    },
    {
      "episode": 13168,
      "epoch": 0.0788965980036189,
      "loss/policy_avg": 0.11917827278375626,
      "lr": 9.474693251533744e-06,
      "objective/entropy": -207.30722045898438,
      "objective/kl": 35.41877746582031,
      "objective/non_score_reward": -1.7709391117095947,
      "objective/rlhf_reward": -5.683756327629089,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 6.870448112487793,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.57421875,
      "step": 822,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9955227375030518
    },
    {
      "episode": 13184,
      "epoch": 0.07899246264274844,
      "loss/policy_avg": -0.3528624475002289,
      "lr": 9.474054192229039e-06,
      "objective/entropy": -138.19627380371094,
      "objective/kl": 27.491954803466797,
      "objective/non_score_reward": -1.3745976686477661,
      "objective/rlhf_reward": -4.156755199938446,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 1.025694727897644,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.52734375,
      "step": 823,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0017786026000977
    },
    {
      "episode": 13200,
      "epoch": 0.079088327281878,
      "loss/policy_avg": 0.3965766727924347,
      "lr": 9.473415132924336e-06,
      "objective/entropy": -244.5587921142578,
      "objective/kl": 28.361434936523438,
      "objective/non_score_reward": -1.4180717468261719,
      "objective/rlhf_reward": -3.549580337778602,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 22.821792602539062,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.53515625,
      "step": 824,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.997540831565857
    },
    {
      "episode": 13216,
      "epoch": 0.07918419192100754,
      "loss/policy_avg": 0.183881938457489,
      "lr": 9.472776073619633e-06,
      "objective/entropy": -235.81063842773438,
      "objective/kl": 35.635047912597656,
      "objective/non_score_reward": -1.7817524671554565,
      "objective/rlhf_reward": -5.785373976736694,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 13.993101119995117,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.681640625,
      "step": 825,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9987282752990723
    },
    {
      "episode": 13232,
      "epoch": 0.07928005656013709,
      "loss/policy_avg": 0.13472305238246918,
      "lr": 9.47213701431493e-06,
      "objective/entropy": -209.61251831054688,
      "objective/kl": 32.511722564697266,
      "objective/non_score_reward": -1.6255862712860107,
      "objective/rlhf_reward": -5.176832351714296,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 10.514575958251953,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.693359375,
      "step": 826,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9980220794677734
    },
    {
      "episode": 13248,
      "epoch": 0.07937592119926663,
      "loss/policy_avg": 0.28974202275276184,
      "lr": 9.471497955010226e-06,
      "objective/entropy": -277.55413818359375,
      "objective/kl": 23.343517303466797,
      "objective/non_score_reward": -1.1671757698059082,
      "objective/rlhf_reward": -4.668703377246857,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 4.868777275085449,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.595703125,
      "step": 827,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9984409809112549
    },
    {
      "episode": 13264,
      "epoch": 0.07947178583839619,
      "loss/policy_avg": 0.0649593323469162,
      "lr": 9.470858895705523e-06,
      "objective/entropy": -168.09161376953125,
      "objective/kl": 32.58544921875,
      "objective/non_score_reward": -1.6292723417282104,
      "objective/rlhf_reward": -5.001317584308323,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 13.682709693908691,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.630859375,
      "step": 828,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9994233846664429
    },
    {
      "episode": 13280,
      "epoch": 0.07956765047752573,
      "loss/policy_avg": 0.01223127543926239,
      "lr": 9.470219836400818e-06,
      "objective/entropy": -24.52312469482422,
      "objective/kl": 37.070613861083984,
      "objective/non_score_reward": -1.8535306453704834,
      "objective/rlhf_reward": -5.963524679751739,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 1.4948031902313232,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.591796875,
      "step": 829,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0006215572357178
    },
    {
      "episode": 13296,
      "epoch": 0.07966351511665529,
      "loss/policy_avg": 0.08012821525335312,
      "lr": 9.469580777096115e-06,
      "objective/entropy": -222.74710083007812,
      "objective/kl": 29.31437873840332,
      "objective/non_score_reward": -1.4657189846038818,
      "objective/rlhf_reward": -5.862875819206238,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 7.948197364807129,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.720703125,
      "step": 830,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.999361515045166
    },
    {
      "episode": 13312,
      "epoch": 0.07975937975578483,
      "loss/policy_avg": 0.25244101881980896,
      "lr": 9.468941717791412e-06,
      "objective/entropy": -256.2400817871094,
      "objective/kl": 25.82564926147461,
      "objective/non_score_reward": -1.2912824153900146,
      "objective/rlhf_reward": -5.165129542350769,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 25.767894744873047,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.625,
      "step": 831,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.9958332777023315
    },
    {
      "episode": 13328,
      "epoch": 0.07985524439491438,
      "loss/policy_avg": 0.20151713490486145,
      "lr": 9.468302658486709e-06,
      "objective/entropy": -176.53012084960938,
      "objective/kl": 31.989328384399414,
      "objective/non_score_reward": -1.5994665622711182,
      "objective/rlhf_reward": -4.94726787051712,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 6.573209762573242,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.65234375,
      "step": 832,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0001397132873535
    },
    {
      "episode": 13344,
      "epoch": 0.07995110903404393,
      "loss/policy_avg": 0.40637868642807007,
      "lr": 9.467663599182006e-06,
      "objective/entropy": -157.83944702148438,
      "objective/kl": 26.236248016357422,
      "objective/non_score_reward": -1.311812400817871,
      "objective/rlhf_reward": -2.847249662876129,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 41.408966064453125,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.69921875,
      "step": 833,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9979543685913086
    },
    {
      "episode": 13360,
      "epoch": 0.08004697367317348,
      "loss/policy_avg": 0.4117756485939026,
      "lr": 9.467024539877301e-06,
      "objective/entropy": -154.52528381347656,
      "objective/kl": 34.40885925292969,
      "objective/non_score_reward": -1.7204430103302002,
      "objective/rlhf_reward": -3.958053027034971,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 5.24909782409668,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.552734375,
      "step": 834,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9996882677078247
    },
    {
      "episode": 13376,
      "epoch": 0.08014283831230302,
      "loss/policy_avg": 0.25968849658966064,
      "lr": 9.466385480572598e-06,
      "objective/entropy": -35.48725509643555,
      "objective/kl": 48.416969299316406,
      "objective/non_score_reward": -2.4208483695983887,
      "objective/rlhf_reward": -8.324143612121029,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 7.6608781814575195,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4736328125,
      "step": 835,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9974427223205566
    },
    {
      "episode": 13392,
      "epoch": 0.08023870295143258,
      "loss/policy_avg": 0.6013174057006836,
      "lr": 9.465746421267893e-06,
      "objective/entropy": -131.218994140625,
      "objective/kl": 40.460113525390625,
      "objective/non_score_reward": -2.023005723953247,
      "objective/rlhf_reward": -6.267194564613412,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 5.2574968338012695,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.765625,
      "step": 836,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9985507726669312
    },
    {
      "episode": 13408,
      "epoch": 0.08033456759056212,
      "loss/policy_avg": 0.024118170142173767,
      "lr": 9.46510736196319e-06,
      "objective/entropy": -219.2191162109375,
      "objective/kl": 37.4605712890625,
      "objective/non_score_reward": -1.8730283975601196,
      "objective/rlhf_reward": -5.544702480511601,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 4.715839385986328,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.603515625,
      "step": 837,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.003450393676758
    },
    {
      "episode": 13424,
      "epoch": 0.08043043222969168,
      "loss/policy_avg": 0.3022631108760834,
      "lr": 9.464468302658487e-06,
      "objective/entropy": -122.02997589111328,
      "objective/kl": 32.87577438354492,
      "objective/non_score_reward": -1.6437886953353882,
      "objective/rlhf_reward": -5.196552612868649,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 3.3451852798461914,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6328125,
      "step": 838,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9992682933807373
    },
    {
      "episode": 13440,
      "epoch": 0.08052629686882122,
      "loss/policy_avg": 0.09435372054576874,
      "lr": 9.463829243353784e-06,
      "objective/entropy": -228.3193817138672,
      "objective/kl": 27.057086944580078,
      "objective/non_score_reward": -1.3528543710708618,
      "objective/rlhf_reward": -3.7495579771405323,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 64.43006896972656,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.642578125,
      "step": 839,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9953296184539795
    },
    {
      "episode": 13456,
      "epoch": 0.08062216150795078,
      "loss/policy_avg": 1.2935261726379395,
      "lr": 9.46319018404908e-06,
      "objective/entropy": -160.080322265625,
      "objective/kl": 34.4007568359375,
      "objective/non_score_reward": -1.7200379371643066,
      "objective/rlhf_reward": -5.538515916376738,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 131.64187622070312,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.6015625,
      "step": 840,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9983408451080322
    },
    {
      "episode": 13472,
      "epoch": 0.08071802614708032,
      "loss/policy_avg": 0.5178288817405701,
      "lr": 9.462551124744378e-06,
      "objective/entropy": -140.98907470703125,
      "objective/kl": 32.42417526245117,
      "objective/non_score_reward": -1.621208906173706,
      "objective/rlhf_reward": -4.084835386276245,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 2.9638893604278564,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7421875,
      "step": 841,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9993921518325806
    },
    {
      "episode": 13488,
      "epoch": 0.08081389078620987,
      "loss/policy_avg": 1.674887776374817,
      "lr": 9.461912065439673e-06,
      "objective/entropy": -140.6572723388672,
      "objective/kl": 33.64493179321289,
      "objective/non_score_reward": -1.682246446609497,
      "objective/rlhf_reward": -2.3289861440658566,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 2.7393760681152344,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.73828125,
      "step": 842,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0029544830322266
    },
    {
      "episode": 13504,
      "epoch": 0.08090975542533942,
      "loss/policy_avg": 0.10809826105833054,
      "lr": 9.46127300613497e-06,
      "objective/entropy": 33.49109649658203,
      "objective/kl": 46.121177673339844,
      "objective/non_score_reward": -2.3060591220855713,
      "objective/rlhf_reward": -7.399407501491616,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 10.247078895568848,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.7578125,
      "step": 843,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9997997283935547
    },
    {
      "episode": 13520,
      "epoch": 0.08100562006446897,
      "loss/policy_avg": 0.08235388994216919,
      "lr": 9.460633946830267e-06,
      "objective/entropy": -232.94918823242188,
      "objective/kl": 29.242427825927734,
      "objective/non_score_reward": -1.4621214866638184,
      "objective/rlhf_reward": -5.848485827445984,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 7.9668121337890625,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.708984375,
      "step": 844,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.998790979385376
    },
    {
      "episode": 13536,
      "epoch": 0.08110148470359851,
      "loss/policy_avg": 0.15575401484966278,
      "lr": 9.459994887525563e-06,
      "objective/entropy": -230.47235107421875,
      "objective/kl": 39.588829040527344,
      "objective/non_score_reward": -1.9794416427612305,
      "objective/rlhf_reward": -6.401994669231113,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 3.700314521789551,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.708984375,
      "step": 845,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9979462623596191
    },
    {
      "episode": 13552,
      "epoch": 0.08119734934272807,
      "loss/policy_avg": 0.13659973442554474,
      "lr": 9.45935582822086e-06,
      "objective/entropy": -174.33474731445312,
      "objective/kl": 28.351903915405273,
      "objective/non_score_reward": -1.4175951480865479,
      "objective/rlhf_reward": -2.746661697269651,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 9.24754524230957,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.705078125,
      "step": 846,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0036792755126953
    },
    {
      "episode": 13568,
      "epoch": 0.08129321398185761,
      "loss/policy_avg": -0.0010715574026107788,
      "lr": 9.458716768916156e-06,
      "objective/entropy": -106.94636535644531,
      "objective/kl": 43.695289611816406,
      "objective/non_score_reward": -2.1847643852233887,
      "objective/rlhf_reward": -7.077198391378509,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 4.114851474761963,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5859375,
      "step": 847,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0022311210632324
    },
    {
      "episode": 13584,
      "epoch": 0.08138907862098717,
      "loss/policy_avg": -0.020745811983942986,
      "lr": 9.458077709611452e-06,
      "objective/entropy": -274.30377197265625,
      "objective/kl": 29.099441528320312,
      "objective/non_score_reward": -1.4549720287322998,
      "objective/rlhf_reward": -4.215768191877919,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 5.374234199523926,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.55078125,
      "step": 848,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0002601146698
    },
    {
      "episode": 13600,
      "epoch": 0.08148494326011671,
      "loss/policy_avg": 0.08369505405426025,
      "lr": 9.45743865030675e-06,
      "objective/entropy": -90.9344482421875,
      "objective/kl": 32.62782669067383,
      "objective/non_score_reward": -1.6313912868499756,
      "objective/rlhf_reward": -4.921445462767201,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 2.873699426651001,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.681640625,
      "step": 849,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9983861446380615
    },
    {
      "episode": 13616,
      "epoch": 0.08158080789924627,
      "loss/policy_avg": 0.12610237300395966,
      "lr": 9.456799591002046e-06,
      "objective/entropy": -216.01071166992188,
      "objective/kl": 31.95155906677246,
      "objective/non_score_reward": -1.5975778102874756,
      "objective/rlhf_reward": -5.048675945311218,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 17.690187454223633,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.630859375,
      "step": 850,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9975669384002686
    },
    {
      "episode": 13632,
      "epoch": 0.08167667253837581,
      "loss/policy_avg": 0.09207138419151306,
      "lr": 9.456160531697343e-06,
      "objective/entropy": -213.504638671875,
      "objective/kl": 33.958152770996094,
      "objective/non_score_reward": -1.69790780544281,
      "objective/rlhf_reward": -5.413029053298336,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 2.278407096862793,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.712890625,
      "step": 851,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9982938766479492
    },
    {
      "episode": 13648,
      "epoch": 0.08177253717750536,
      "loss/policy_avg": 0.7879657745361328,
      "lr": 9.45552147239264e-06,
      "objective/entropy": -179.40536499023438,
      "objective/kl": 38.20147705078125,
      "objective/non_score_reward": -1.91007399559021,
      "objective/rlhf_reward": -6.216463644702998,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 3.275893211364746,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.52734375,
      "step": 852,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.000800609588623
    },
    {
      "episode": 13664,
      "epoch": 0.0818684018166349,
      "loss/policy_avg": -0.05168546736240387,
      "lr": 9.454882413087935e-06,
      "objective/entropy": -252.6636505126953,
      "objective/kl": 36.603004455566406,
      "objective/non_score_reward": -1.8301501274108887,
      "objective/rlhf_reward": -5.65874100250064,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 1.8799333572387695,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.734375,
      "step": 853,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.000197410583496
    },
    {
      "episode": 13680,
      "epoch": 0.08196426645576446,
      "loss/policy_avg": 0.35540589690208435,
      "lr": 9.454243353783232e-06,
      "objective/entropy": -263.326171875,
      "objective/kl": 31.936683654785156,
      "objective/non_score_reward": -1.5968341827392578,
      "objective/rlhf_reward": -5.0618239379226395,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 9.10447883605957,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.591796875,
      "step": 854,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9981427192687988
    },
    {
      "episode": 13696,
      "epoch": 0.082060131094894,
      "loss/policy_avg": -0.01314299926161766,
      "lr": 9.453604294478529e-06,
      "objective/entropy": -50.54448699951172,
      "objective/kl": 27.010623931884766,
      "objective/non_score_reward": -1.3505312204360962,
      "objective/rlhf_reward": -4.002124941349029,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 72.71121215820312,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.755859375,
      "step": 855,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9988174438476562
    },
    {
      "episode": 13712,
      "epoch": 0.08215599573402356,
      "loss/policy_avg": 0.507459282875061,
      "lr": 9.452965235173824e-06,
      "objective/entropy": -196.7661590576172,
      "objective/kl": 41.39533615112305,
      "objective/non_score_reward": -2.0697667598724365,
      "objective/rlhf_reward": -6.331655929760869,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 6.633426666259766,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.734375,
      "step": 856,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9997053146362305
    },
    {
      "episode": 13728,
      "epoch": 0.0822518603731531,
      "loss/policy_avg": 0.01022842712700367,
      "lr": 9.452326175869121e-06,
      "objective/entropy": -165.575439453125,
      "objective/kl": 28.162111282348633,
      "objective/non_score_reward": -1.408105492591858,
      "objective/rlhf_reward": -4.253819801894528,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 2.566072463989258,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.673828125,
      "step": 857,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9994425773620605
    },
    {
      "episode": 13744,
      "epoch": 0.08234772501228266,
      "loss/policy_avg": 0.5199975371360779,
      "lr": 9.451687116564418e-06,
      "objective/entropy": -191.289794921875,
      "objective/kl": 25.639848709106445,
      "objective/non_score_reward": -1.2819924354553223,
      "objective/rlhf_reward": -3.6121978996121253,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 2.2938754558563232,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.4873046875,
      "step": 858,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9993865489959717
    },
    {
      "episode": 13760,
      "epoch": 0.0824435896514122,
      "loss/policy_avg": -0.09089094400405884,
      "lr": 9.451048057259715e-06,
      "objective/entropy": -222.6432647705078,
      "objective/kl": 35.101905822753906,
      "objective/non_score_reward": -1.7550954818725586,
      "objective/rlhf_reward": -5.641779580203396,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 1.5215179920196533,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.701171875,
      "step": 859,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0004208087921143
    },
    {
      "episode": 13776,
      "epoch": 0.08253945429054176,
      "loss/policy_avg": 0.3994244635105133,
      "lr": 9.45040899795501e-06,
      "objective/entropy": -232.05795288085938,
      "objective/kl": 35.13083267211914,
      "objective/non_score_reward": -1.7565417289733887,
      "objective/rlhf_reward": -2.6261669158935543,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 7.337094306945801,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.736328125,
      "step": 860,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0000622272491455
    },
    {
      "episode": 13792,
      "epoch": 0.0826353189296713,
      "loss/policy_avg": 0.241072878241539,
      "lr": 9.449769938650307e-06,
      "objective/entropy": -235.5231475830078,
      "objective/kl": 42.96981430053711,
      "objective/non_score_reward": -2.1484906673431396,
      "objective/rlhf_reward": -6.860629336039224,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 5.666136264801025,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.681640625,
      "step": 861,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9980902671813965
    },
    {
      "episode": 13808,
      "epoch": 0.08273118356880085,
      "loss/policy_avg": 0.06892701238393784,
      "lr": 9.449130879345604e-06,
      "objective/entropy": -43.37392044067383,
      "objective/kl": 28.94279670715332,
      "objective/non_score_reward": -1.447139859199524,
      "objective/rlhf_reward": -4.446923902540832,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 13.682140350341797,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.873046875,
      "step": 862,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.998504638671875
    },
    {
      "episode": 13824,
      "epoch": 0.0828270482079304,
      "loss/policy_avg": 0.05104389786720276,
      "lr": 9.4484918200409e-06,
      "objective/entropy": -274.24462890625,
      "objective/kl": 26.58008575439453,
      "objective/non_score_reward": -1.3290044069290161,
      "objective/rlhf_reward": -3.6541578821545704,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 14.673041343688965,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6796875,
      "step": 863,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.998407244682312
    },
    {
      "episode": 13840,
      "epoch": 0.08292291284705995,
      "loss/policy_avg": 2.0433521270751953,
      "lr": 9.447852760736197e-06,
      "objective/entropy": -141.08175659179688,
      "objective/kl": 38.66474151611328,
      "objective/non_score_reward": -1.933237075805664,
      "objective/rlhf_reward": -6.282350401492462,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 3.4866771697998047,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.849609375,
      "step": 864,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0020241737365723
    },
    {
      "episode": 13856,
      "epoch": 0.08301877748618951,
      "loss/policy_avg": 0.5822303891181946,
      "lr": 9.447213701431494e-06,
      "objective/entropy": -75.44483184814453,
      "objective/kl": 42.41981506347656,
      "objective/non_score_reward": -2.1209909915924072,
      "objective/rlhf_reward": -7.033365587802276,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 1.0502395629882812,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.763671875,
      "step": 865,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0002496242523193
    },
    {
      "episode": 13872,
      "epoch": 0.08311464212531905,
      "loss/policy_avg": 1.5961978435516357,
      "lr": 9.44657464212679e-06,
      "objective/entropy": -102.62336730957031,
      "objective/kl": 32.63710021972656,
      "objective/non_score_reward": -1.6318550109863281,
      "objective/rlhf_reward": -4.702591176303934,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 44.34449005126953,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.583984375,
      "step": 866,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.997666597366333
    },
    {
      "episode": 13888,
      "epoch": 0.0832105067644486,
      "loss/policy_avg": -0.06377097964286804,
      "lr": 9.445935582822086e-06,
      "objective/entropy": -179.53016662597656,
      "objective/kl": 27.1846981048584,
      "objective/non_score_reward": -1.3592349290847778,
      "objective/rlhf_reward": -3.3142334840455394,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 11.25791072845459,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.67578125,
      "step": 867,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001521110534668
    },
    {
      "episode": 13904,
      "epoch": 0.08330637140357815,
      "loss/policy_avg": 0.06122337281703949,
      "lr": 9.445296523517383e-06,
      "objective/entropy": -160.8975830078125,
      "objective/kl": 37.28607940673828,
      "objective/non_score_reward": -1.8643040657043457,
      "objective/rlhf_reward": -6.131703171759767,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 2.380110263824463,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.685546875,
      "step": 868,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9993302822113037
    },
    {
      "episode": 13920,
      "epoch": 0.0834022360427077,
      "loss/policy_avg": 0.06397978216409683,
      "lr": 9.44465746421268e-06,
      "objective/entropy": -279.75146484375,
      "objective/kl": 36.54051971435547,
      "objective/non_score_reward": -1.8270260095596313,
      "objective/rlhf_reward": -5.3606928093003585,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 9.166413307189941,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6953125,
      "step": 869,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.998555064201355
    },
    {
      "episode": 13936,
      "epoch": 0.08349810068183725,
      "loss/policy_avg": 0.18339544534683228,
      "lr": 9.444018404907977e-06,
      "objective/entropy": -197.06088256835938,
      "objective/kl": 35.413883209228516,
      "objective/non_score_reward": -1.7706942558288574,
      "objective/rlhf_reward": -5.420917516172516,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 2.4228196144104004,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.55859375,
      "step": 870,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9997130632400513
    },
    {
      "episode": 13952,
      "epoch": 0.0835939653209668,
      "loss/policy_avg": 0.7395508885383606,
      "lr": 9.443379345603272e-06,
      "objective/entropy": -175.5420684814453,
      "objective/kl": 27.310260772705078,
      "objective/non_score_reward": -1.3655130863189697,
      "objective/rlhf_reward": -3.9057928611903936,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 20.016393661499023,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.744140625,
      "step": 871,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9980931282043457
    },
    {
      "episode": 13968,
      "epoch": 0.08368982996009634,
      "loss/policy_avg": 0.11419187486171722,
      "lr": 9.442740286298569e-06,
      "objective/entropy": -202.19219970703125,
      "objective/kl": 26.73446273803711,
      "objective/non_score_reward": -1.3367230892181396,
      "objective/rlhf_reward": -0.9468923568725582,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 1.4593892097473145,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.505859375,
      "step": 872,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9997605085372925
    },
    {
      "episode": 13984,
      "epoch": 0.0837856945992259,
      "loss/policy_avg": 0.10254265367984772,
      "lr": 9.442101226993866e-06,
      "objective/entropy": -181.49607849121094,
      "objective/kl": 34.489620208740234,
      "objective/non_score_reward": -1.7244811058044434,
      "objective/rlhf_reward": -5.2938043213525585,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 10.685236930847168,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.572265625,
      "step": 873,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0004138946533203
    },
    {
      "episode": 14000,
      "epoch": 0.08388155923835544,
      "loss/policy_avg": -0.11048807948827744,
      "lr": 9.441462167689163e-06,
      "objective/entropy": -233.58718872070312,
      "objective/kl": 27.196325302124023,
      "objective/non_score_reward": -1.359816312789917,
      "objective/rlhf_reward": -4.080015146468563,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 7.074767112731934,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.599609375,
      "step": 874,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.000337600708008
    },
    {
      "episode": 14016,
      "epoch": 0.083977423877485,
      "loss/policy_avg": -0.04991217330098152,
      "lr": 9.44082310838446e-06,
      "objective/entropy": -147.29574584960938,
      "objective/kl": 39.145992279052734,
      "objective/non_score_reward": -1.9572995901107788,
      "objective/rlhf_reward": -6.429198360443115,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 2.3655714988708496,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.703125,
      "step": 875,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0024516582489014
    },
    {
      "episode": 14032,
      "epoch": 0.08407328851661454,
      "loss/policy_avg": 0.018214020878076553,
      "lr": 9.440184049079757e-06,
      "objective/entropy": -225.25274658203125,
      "objective/kl": 28.496929168701172,
      "objective/non_score_reward": -1.4248464107513428,
      "objective/rlhf_reward": -4.248787502856597,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 8.280494689941406,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.78515625,
      "step": 876,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0028023719787598
    },
    {
      "episode": 14048,
      "epoch": 0.0841691531557441,
      "loss/policy_avg": -0.0712839737534523,
      "lr": 9.439544989775052e-06,
      "objective/entropy": -111.49925231933594,
      "objective/kl": 33.307395935058594,
      "objective/non_score_reward": -1.6653697490692139,
      "objective/rlhf_reward": -5.237647135456172,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 10.649118423461914,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.744140625,
      "step": 877,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0005428791046143
    },
    {
      "episode": 14064,
      "epoch": 0.08426501779487364,
      "loss/policy_avg": 0.416260302066803,
      "lr": 9.438905930470349e-06,
      "objective/entropy": -91.5921630859375,
      "objective/kl": 36.07551193237305,
      "objective/non_score_reward": -1.8037755489349365,
      "objective/rlhf_reward": -5.658842890468195,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 14.971528053283691,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.546875,
      "step": 878,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.99779212474823
    },
    {
      "episode": 14080,
      "epoch": 0.08436088243400319,
      "loss/policy_avg": 0.6945221424102783,
      "lr": 9.438266871165644e-06,
      "objective/entropy": -103.2996597290039,
      "objective/kl": 29.02838706970215,
      "objective/non_score_reward": -1.4514193534851074,
      "objective/rlhf_reward": -4.249418287482813,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 3.5951104164123535,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6015625,
      "step": 879,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9996235370635986
    },
    {
      "episode": 14096,
      "epoch": 0.08445674707313273,
      "loss/policy_avg": 0.14096970856189728,
      "lr": 9.43762781186094e-06,
      "objective/entropy": -250.6915283203125,
      "objective/kl": 24.03522491455078,
      "objective/non_score_reward": -1.2017613649368286,
      "objective/rlhf_reward": -3.4284433508790553,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 141.8468017578125,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.681640625,
      "step": 880,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9993736743927002
    },
    {
      "episode": 14112,
      "epoch": 0.08455261171226229,
      "loss/policy_avg": 0.3699185848236084,
      "lr": 9.436988752556238e-06,
      "objective/entropy": -159.3045196533203,
      "objective/kl": 40.019386291503906,
      "objective/non_score_reward": -2.000969409942627,
      "objective/rlhf_reward": -6.553278903575286,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 8.20317554473877,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.775390625,
      "step": 881,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9977927207946777
    },
    {
      "episode": 14128,
      "epoch": 0.08464847635139183,
      "loss/policy_avg": 0.41995298862457275,
      "lr": 9.436349693251534e-06,
      "objective/entropy": 76.95626068115234,
      "objective/kl": 39.00627899169922,
      "objective/non_score_reward": -1.9503138065338135,
      "objective/rlhf_reward": -6.139395838201629,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 31.75859832763672,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.568359375,
      "step": 882,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 1.9953019618988037
    },
    {
      "episode": 14144,
      "epoch": 0.08474434099052139,
      "loss/policy_avg": 0.5355075597763062,
      "lr": 9.435710633946831e-06,
      "objective/entropy": -164.35186767578125,
      "objective/kl": 42.27740478515625,
      "objective/non_score_reward": -2.113870143890381,
      "objective/rlhf_reward": -7.113845041304259,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 20.66805648803711,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.72265625,
      "step": 883,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.9979139566421509
    },
    {
      "episode": 14160,
      "epoch": 0.08484020562965093,
      "loss/policy_avg": 0.12046757340431213,
      "lr": 9.435071574642126e-06,
      "objective/entropy": -139.48226928710938,
      "objective/kl": 35.96446228027344,
      "objective/non_score_reward": -1.7982230186462402,
      "objective/rlhf_reward": -5.833642208312435,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 5.584999084472656,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.763671875,
      "step": 884,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9985802173614502
    },
    {
      "episode": 14176,
      "epoch": 0.08493607026878049,
      "loss/policy_avg": 0.20259422063827515,
      "lr": 9.434432515337423e-06,
      "objective/entropy": -194.32472229003906,
      "objective/kl": 29.422592163085938,
      "objective/non_score_reward": -1.4711295366287231,
      "objective/rlhf_reward": -4.151184813181559,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 7.590093612670898,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.705078125,
      "step": 885,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0000078678131104
    },
    {
      "episode": 14192,
      "epoch": 0.08503193490791003,
      "loss/policy_avg": 0.38378089666366577,
      "lr": 9.43379345603272e-06,
      "objective/entropy": -204.80718994140625,
      "objective/kl": 26.858444213867188,
      "objective/non_score_reward": -1.3429222106933594,
      "objective/rlhf_reward": -3.947856862743465,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 54.279869079589844,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.8828125,
      "step": 886,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.000415086746216
    },
    {
      "episode": 14208,
      "epoch": 0.08512779954703958,
      "loss/policy_avg": 0.27804744243621826,
      "lr": 9.433154396728017e-06,
      "objective/entropy": -216.76026916503906,
      "objective/kl": 31.35245704650879,
      "objective/non_score_reward": -1.5676229000091553,
      "objective/rlhf_reward": -4.928855529337554,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 44.15214157104492,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.841796875,
      "step": 887,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9985570907592773
    },
    {
      "episode": 14224,
      "epoch": 0.08522366418616913,
      "loss/policy_avg": 0.1285010725259781,
      "lr": 9.432515337423314e-06,
      "objective/entropy": -256.2292175292969,
      "objective/kl": 22.457351684570312,
      "objective/non_score_reward": -1.1228675842285156,
      "objective/rlhf_reward": -2.5440589291619613,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 2.694319725036621,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.591796875,
      "step": 888,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.9982428550720215
    },
    {
      "episode": 14240,
      "epoch": 0.08531952882529868,
      "loss/policy_avg": 0.1620079129934311,
      "lr": 9.431876278118611e-06,
      "objective/entropy": -246.3665313720703,
      "objective/kl": 32.27862548828125,
      "objective/non_score_reward": -1.6139311790466309,
      "objective/rlhf_reward": -5.03189285536584,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 4.128833293914795,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.626953125,
      "step": 889,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0017335414886475
    },
    {
      "episode": 14256,
      "epoch": 0.08541539346442822,
      "loss/policy_avg": 0.6714350581169128,
      "lr": 9.431237218813906e-06,
      "objective/entropy": -87.00444793701172,
      "objective/kl": 30.12160873413086,
      "objective/non_score_reward": -1.5060807466506958,
      "objective/rlhf_reward": -4.600490768154231,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 30.543041229248047,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.681640625,
      "step": 890,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9955544471740723
    },
    {
      "episode": 14272,
      "epoch": 0.08551125810355778,
      "loss/policy_avg": 0.5368032455444336,
      "lr": 9.430598159509203e-06,
      "objective/entropy": -151.2410125732422,
      "objective/kl": 23.1306095123291,
      "objective/non_score_reward": -1.1565306186676025,
      "objective/rlhf_reward": -3.266872340176983,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 18.648775100708008,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62890625,
      "step": 891,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999136209487915
    },
    {
      "episode": 14288,
      "epoch": 0.08560712274268732,
      "loss/policy_avg": -0.4043048024177551,
      "lr": 9.4299591002045e-06,
      "objective/entropy": -214.12281799316406,
      "objective/kl": 38.173484802246094,
      "objective/non_score_reward": -1.9086743593215942,
      "objective/rlhf_reward": -5.972837810934173,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 3.6675243377685547,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.580078125,
      "step": 892,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.000812292098999
    },
    {
      "episode": 14304,
      "epoch": 0.08570298738181688,
      "loss/policy_avg": 1.0885683298110962,
      "lr": 9.429320040899797e-06,
      "objective/entropy": -234.37998962402344,
      "objective/kl": 27.995094299316406,
      "objective/non_score_reward": -1.3997547626495361,
      "objective/rlhf_reward": -4.239769363139553,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 1.2649099826812744,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.6640625,
      "step": 893,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0003528594970703
    },
    {
      "episode": 14320,
      "epoch": 0.08579885202094642,
      "loss/policy_avg": -0.1013278141617775,
      "lr": 9.428680981595094e-06,
      "objective/entropy": -156.33245849609375,
      "objective/kl": 35.587982177734375,
      "objective/non_score_reward": -1.779399037361145,
      "objective/rlhf_reward": -5.738993861762387,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 8.389669418334961,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.666015625,
      "step": 894,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.99942946434021
    },
    {
      "episode": 14336,
      "epoch": 0.08589471666007598,
      "loss/policy_avg": -0.006531273480504751,
      "lr": 9.42804192229039e-06,
      "objective/entropy": -197.26820373535156,
      "objective/kl": 21.04766082763672,
      "objective/non_score_reward": -1.0523829460144043,
      "objective/rlhf_reward": -2.6532727172046453,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 1.4280903339385986,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.69140625,
      "step": 895,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001786708831787
    },
    {
      "episode": 14352,
      "epoch": 0.08599058129920552,
      "loss/policy_avg": 0.10259456932544708,
      "lr": 9.427402862985686e-06,
      "objective/entropy": -120.49540710449219,
      "objective/kl": 37.17432403564453,
      "objective/non_score_reward": -1.858716368675232,
      "objective/rlhf_reward": -3.034865355491638,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 6.6070685386657715,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.615234375,
      "step": 896,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999729871749878
    },
    {
      "episode": 14368,
      "epoch": 0.08608644593833507,
      "loss/policy_avg": 0.18344524502754211,
      "lr": 9.426763803680982e-06,
      "objective/entropy": -84.0172348022461,
      "objective/kl": 32.38622283935547,
      "objective/non_score_reward": -1.6193112134933472,
      "objective/rlhf_reward": -5.117994987700863,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 33.82829284667969,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.529296875,
      "step": 897,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 1.9998817443847656
    },
    {
      "episode": 14384,
      "epoch": 0.08618231057746462,
      "loss/policy_avg": 0.7863380312919617,
      "lr": 9.42612474437628e-06,
      "objective/entropy": -94.4057388305664,
      "objective/kl": 31.75823402404785,
      "objective/non_score_reward": -1.58791184425354,
      "objective/rlhf_reward": -3.4279281839143962,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 3.406008243560791,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.8125,
      "step": 898,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0038909912109375
    },
    {
      "episode": 14400,
      "epoch": 0.08627817521659417,
      "loss/policy_avg": 0.5351603031158447,
      "lr": 9.425485685071576e-06,
      "objective/entropy": -265.2181396484375,
      "objective/kl": 29.21182632446289,
      "objective/non_score_reward": -1.460591197013855,
      "objective/rlhf_reward": -4.1090314547220865,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 4.054888725280762,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.57421875,
      "step": 899,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.998013973236084
    },
    {
      "episode": 14416,
      "epoch": 0.08637403985572371,
      "loss/policy_avg": 0.013534091413021088,
      "lr": 9.424846625766873e-06,
      "objective/entropy": -194.56564331054688,
      "objective/kl": 24.970386505126953,
      "objective/non_score_reward": -1.2485194206237793,
      "objective/rlhf_reward": -3.6348278162225913,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 0.42985397577285767,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.53515625,
      "step": 900,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0002646446228027
    },
    {
      "episode": 14432,
      "epoch": 0.08646990449485327,
      "loss/policy_avg": -0.026506464928388596,
      "lr": 9.424207566462168e-06,
      "objective/entropy": -121.82954406738281,
      "objective/kl": 38.97528839111328,
      "objective/non_score_reward": -1.9487645626068115,
      "objective/rlhf_reward": -7.795057892799377,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 18.97709846496582,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.3564453125,
      "step": 901,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9988110065460205
    },
    {
      "episode": 14448,
      "epoch": 0.08656576913398281,
      "loss/policy_avg": 0.04643288254737854,
      "lr": 9.423568507157465e-06,
      "objective/entropy": -97.38468170166016,
      "objective/kl": 28.042333602905273,
      "objective/non_score_reward": -1.4021167755126953,
      "objective/rlhf_reward": -4.184634823997585,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 2.1407618522644043,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.60546875,
      "step": 902,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9999892711639404
    },
    {
      "episode": 14464,
      "epoch": 0.08666163377311237,
      "loss/policy_avg": 0.5154027342796326,
      "lr": 9.42292944785276e-06,
      "objective/entropy": -250.2370147705078,
      "objective/kl": 25.91543960571289,
      "objective/non_score_reward": -1.2957720756530762,
      "objective/rlhf_reward": -3.759255845745174,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 1.9840008020401,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.607421875,
      "step": 903,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9984569549560547
    },
    {
      "episode": 14480,
      "epoch": 0.08675749841224191,
      "loss/policy_avg": -0.12090878188610077,
      "lr": 9.422290388548057e-06,
      "objective/entropy": -224.9342041015625,
      "objective/kl": 21.860130310058594,
      "objective/non_score_reward": -1.0930064916610718,
      "objective/rlhf_reward": -2.42461485691541,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 6.253545761108398,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.751953125,
      "step": 904,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.000030755996704
    },
    {
      "episode": 14496,
      "epoch": 0.08685336305137147,
      "loss/policy_avg": 0.2192097306251526,
      "lr": 9.421651329243354e-06,
      "objective/entropy": -116.75704956054688,
      "objective/kl": 40.641937255859375,
      "objective/non_score_reward": -2.0320968627929688,
      "objective/rlhf_reward": -6.786751320868163,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 3.1222383975982666,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.525390625,
      "step": 905,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9990143775939941
    },
    {
      "episode": 14512,
      "epoch": 0.08694922769050101,
      "loss/policy_avg": 0.014911421574652195,
      "lr": 9.421012269938651e-06,
      "objective/entropy": -169.34967041015625,
      "objective/kl": 19.47471809387207,
      "objective/non_score_reward": -0.9737359285354614,
      "objective/rlhf_reward": -1.7722373626389838,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 3.2120165824890137,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.65234375,
      "step": 906,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0021252632141113
    },
    {
      "episode": 14528,
      "epoch": 0.08704509232963056,
      "loss/policy_avg": -0.06861399114131927,
      "lr": 9.420373210633948e-06,
      "objective/entropy": -199.73748779296875,
      "objective/kl": 32.33728790283203,
      "objective/non_score_reward": -1.6168644428253174,
      "objective/rlhf_reward": -5.088855722037655,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 7.329561233520508,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.58203125,
      "step": 907,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.1414122581481934
    },
    {
      "episode": 14544,
      "epoch": 0.0871409569687601,
      "loss/policy_avg": -0.0006491807289421558,
      "lr": 9.419734151329245e-06,
      "objective/entropy": -241.5037078857422,
      "objective/kl": 26.676612854003906,
      "objective/non_score_reward": -1.3338308334350586,
      "objective/rlhf_reward": -0.9353229761123654,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 2.882882595062256,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.568359375,
      "step": 908,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9995675086975098
    },
    {
      "episode": 14560,
      "epoch": 0.08723682160788966,
      "loss/policy_avg": -0.30844664573669434,
      "lr": 9.41909509202454e-06,
      "objective/entropy": -193.48281860351562,
      "objective/kl": 32.22890853881836,
      "objective/non_score_reward": -1.6114455461502075,
      "objective/rlhf_reward": -4.712448493639627,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 9.586688995361328,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.537109375,
      "step": 909,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0032057762145996
    },
    {
      "episode": 14576,
      "epoch": 0.0873326862470192,
      "loss/policy_avg": 0.10456671565771103,
      "lr": 9.418456032719837e-06,
      "objective/entropy": -214.8862762451172,
      "objective/kl": 30.845359802246094,
      "objective/non_score_reward": -1.5422677993774414,
      "objective/rlhf_reward": -4.769071197509765,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 48.766883850097656,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.560546875,
      "step": 910,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.0017874240875244
    },
    {
      "episode": 14592,
      "epoch": 0.08742855088614876,
      "loss/policy_avg": 0.011322952806949615,
      "lr": 9.417816973415134e-06,
      "objective/entropy": -148.18869018554688,
      "objective/kl": 34.653785705566406,
      "objective/non_score_reward": -1.7326891422271729,
      "objective/rlhf_reward": -5.5069247080880075,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 2.303962230682373,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.740234375,
      "step": 911,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.001537561416626
    },
    {
      "episode": 14608,
      "epoch": 0.0875244155252783,
      "loss/policy_avg": 1.4446654319763184,
      "lr": 9.41717791411043e-06,
      "objective/entropy": -151.7039337158203,
      "objective/kl": 36.139678955078125,
      "objective/non_score_reward": -1.8069840669631958,
      "objective/rlhf_reward": -5.623816165987568,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 8.342704772949219,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7421875,
      "step": 912,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.997730016708374
    },
    {
      "episode": 14624,
      "epoch": 0.08762028016440786,
      "loss/policy_avg": 0.007501431740820408,
      "lr": 9.416538854805727e-06,
      "objective/entropy": -192.82723999023438,
      "objective/kl": 28.006526947021484,
      "objective/non_score_reward": -1.4003264904022217,
      "objective/rlhf_reward": -3.776477153572153,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 9.397720336914062,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.74609375,
      "step": 913,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9988361597061157
    },
    {
      "episode": 14640,
      "epoch": 0.0877161448035374,
      "loss/policy_avg": 0.7067223787307739,
      "lr": 9.415899795501023e-06,
      "objective/entropy": -199.13888549804688,
      "objective/kl": 40.245330810546875,
      "objective/non_score_reward": -2.0122666358947754,
      "objective/rlhf_reward": -6.387206798017608,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 1.6032171249389648,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.626953125,
      "step": 914,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0032029151916504
    },
    {
      "episode": 14656,
      "epoch": 0.08781200944266696,
      "loss/policy_avg": 0.7447987198829651,
      "lr": 9.41526073619632e-06,
      "objective/entropy": -192.03024291992188,
      "objective/kl": 33.84302520751953,
      "objective/non_score_reward": -1.6921510696411133,
      "objective/rlhf_reward": -5.212345330920771,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 12.58854866027832,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.708984375,
      "step": 915,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9982028007507324
    },
    {
      "episode": 14672,
      "epoch": 0.0879078740817965,
      "loss/policy_avg": -0.12448902428150177,
      "lr": 9.414621676891616e-06,
      "objective/entropy": -108.39199829101562,
      "objective/kl": 27.540185928344727,
      "objective/non_score_reward": -1.3770092725753784,
      "objective/rlhf_reward": -3.3853308580079418,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 0.6809393763542175,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.771484375,
      "step": 916,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0027718544006348
    },
    {
      "episode": 14688,
      "epoch": 0.08800373872092605,
      "loss/policy_avg": 0.09778769314289093,
      "lr": 9.413982617586913e-06,
      "objective/entropy": -83.20165252685547,
      "objective/kl": 27.68124008178711,
      "objective/non_score_reward": -1.3840619325637817,
      "objective/rlhf_reward": -3.7114191010323276,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 7.001269340515137,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6015625,
      "step": 917,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9994606971740723
    },
    {
      "episode": 14704,
      "epoch": 0.0880996033600556,
      "loss/policy_avg": 0.7267050743103027,
      "lr": 9.41334355828221e-06,
      "objective/entropy": -174.48663330078125,
      "objective/kl": 32.38935089111328,
      "objective/non_score_reward": -1.6194674968719482,
      "objective/rlhf_reward": -6.477869987487793,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 9.753436088562012,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.6796875,
      "step": 918,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9973618984222412
    },
    {
      "episode": 14720,
      "epoch": 0.08819546799918515,
      "loss/policy_avg": 0.18099595606327057,
      "lr": 9.412704498977507e-06,
      "objective/entropy": -232.4264373779297,
      "objective/kl": 37.20670700073242,
      "objective/non_score_reward": -1.860335350036621,
      "objective/rlhf_reward": -6.115828309088869,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 41.893341064453125,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.79296875,
      "step": 919,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.002889633178711
    },
    {
      "episode": 14736,
      "epoch": 0.08829133263831469,
      "loss/policy_avg": 0.43639302253723145,
      "lr": 9.412065439672802e-06,
      "objective/entropy": -183.69644165039062,
      "objective/kl": 24.13558006286621,
      "objective/non_score_reward": -1.2067790031433105,
      "objective/rlhf_reward": -2.8797047836350753,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 30.2447509765625,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.55078125,
      "step": 920,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9992506504058838
    },
    {
      "episode": 14752,
      "epoch": 0.08838719727744425,
      "loss/policy_avg": 0.5567411780357361,
      "lr": 9.411426380368099e-06,
      "objective/entropy": -285.06512451171875,
      "objective/kl": 32.89839553833008,
      "objective/non_score_reward": -1.644919753074646,
      "objective/rlhf_reward": -4.8463457385698945,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 18.15423583984375,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.609375,
      "step": 921,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.997804880142212
    },
    {
      "episode": 14768,
      "epoch": 0.0884830619165738,
      "loss/policy_avg": -0.024971559643745422,
      "lr": 9.410787321063396e-06,
      "objective/entropy": -144.00473022460938,
      "objective/kl": 27.061277389526367,
      "objective/non_score_reward": -1.353063941001892,
      "objective/rlhf_reward": -4.033653714743954,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 1.530630111694336,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62890625,
      "step": 922,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0000722408294678
    },
    {
      "episode": 14784,
      "epoch": 0.08857892655570335,
      "loss/policy_avg": -0.49618157744407654,
      "lr": 9.410148261758691e-06,
      "objective/entropy": -37.43824768066406,
      "objective/kl": 35.81788635253906,
      "objective/non_score_reward": -1.7908943891525269,
      "objective/rlhf_reward": -5.501717870653259,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 4.131357192993164,
      "policy/clipfrac_avg": 2.0,
      "policy/entropy_avg": 0.4287109375,
      "step": 923,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0173897743225098
    },
    {
      "episode": 14800,
      "epoch": 0.0886747911948329,
      "loss/policy_avg": 0.05783979594707489,
      "lr": 9.409509202453988e-06,
      "objective/entropy": -154.13516235351562,
      "objective/kl": 46.57448196411133,
      "objective/non_score_reward": -2.3287243843078613,
      "objective/rlhf_reward": -7.653037791669952,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 2.5200886726379395,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.5390625,
      "step": 924,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.9987819194793701
    },
    {
      "episode": 14816,
      "epoch": 0.08877065583396244,
      "loss/policy_avg": 0.034926094114780426,
      "lr": 9.408870143149285e-06,
      "objective/entropy": -221.52577209472656,
      "objective/kl": 35.47760772705078,
      "objective/non_score_reward": -1.7738804817199707,
      "objective/rlhf_reward": -5.614569070752024,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 1.4324332475662231,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.66796875,
      "step": 925,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9995288848876953
    },
    {
      "episode": 14832,
      "epoch": 0.088866520473092,
      "loss/policy_avg": 0.32427555322647095,
      "lr": 9.408231083844582e-06,
      "objective/entropy": -130.25445556640625,
      "objective/kl": 34.63972473144531,
      "objective/non_score_reward": -1.7319860458374023,
      "objective/rlhf_reward": -5.371684878078058,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 3.6408345699310303,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.568359375,
      "step": 926,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0012738704681396
    },
    {
      "episode": 14848,
      "epoch": 0.08896238511222154,
      "loss/policy_avg": -0.27763280272483826,
      "lr": 9.407592024539877e-06,
      "objective/entropy": -244.65667724609375,
      "objective/kl": 27.930646896362305,
      "objective/non_score_reward": -1.396532416343689,
      "objective/rlhf_reward": -3.7613009765473118,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 65.45894622802734,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.66015625,
      "step": 927,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.002328872680664
    },
    {
      "episode": 14864,
      "epoch": 0.0890582497513511,
      "loss/policy_avg": 0.39164024591445923,
      "lr": 9.406952965235174e-06,
      "objective/entropy": -92.6754150390625,
      "objective/kl": 40.35970687866211,
      "objective/non_score_reward": -2.0179853439331055,
      "objective/rlhf_reward": -5.148222361446592,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 9.222280502319336,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.587890625,
      "step": 928,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9965031147003174
    },
    {
      "episode": 14880,
      "epoch": 0.08915411439048064,
      "loss/policy_avg": 0.018820755183696747,
      "lr": 9.40631390593047e-06,
      "objective/entropy": -221.75802612304688,
      "objective/kl": 32.733848571777344,
      "objective/non_score_reward": -1.6366922855377197,
      "objective/rlhf_reward": -4.942649397913533,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 1.5601739883422852,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.67578125,
      "step": 929,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 2.0016555786132812
    },
    {
      "episode": 14896,
      "epoch": 0.0892499790296102,
      "loss/policy_avg": 0.02956710010766983,
      "lr": 9.405674846625768e-06,
      "objective/entropy": -225.1991729736328,
      "objective/kl": 27.00541114807129,
      "objective/non_score_reward": -1.3502705097198486,
      "objective/rlhf_reward": -4.059446623831421,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 6.755413055419922,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.8046875,
      "step": 930,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.001051664352417
    },
    {
      "episode": 14912,
      "epoch": 0.08934584366873974,
      "loss/policy_avg": 0.08642945438623428,
      "lr": 9.405035787321065e-06,
      "objective/entropy": -179.3356475830078,
      "objective/kl": 36.390193939208984,
      "objective/non_score_reward": -1.8195096254348755,
      "objective/rlhf_reward": -5.330627392010625,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 10.583852767944336,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.4501953125,
      "step": 931,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999453067779541
    },
    {
      "episode": 14928,
      "epoch": 0.0894417083078693,
      "loss/policy_avg": 0.08466912060976028,
      "lr": 9.404396728016361e-06,
      "objective/entropy": -160.34024047851562,
      "objective/kl": 48.99607849121094,
      "objective/non_score_reward": -2.4498043060302734,
      "objective/rlhf_reward": -8.195096407000142,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 0.9886335134506226,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.666015625,
      "step": 932,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 2.0025954246520996
    },
    {
      "episode": 14944,
      "epoch": 0.08953757294699884,
      "loss/policy_avg": 0.3508598804473877,
      "lr": 9.403757668711657e-06,
      "objective/entropy": -177.20993041992188,
      "objective/kl": 32.381324768066406,
      "objective/non_score_reward": -1.6190659999847412,
      "objective/rlhf_reward": -5.150751504927797,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 31.277324676513672,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.4716796875,
      "step": 933,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9997992515563965
    },
    {
      "episode": 14960,
      "epoch": 0.08963343758612839,
      "loss/policy_avg": 0.11015394330024719,
      "lr": 9.403118609406953e-06,
      "objective/entropy": -203.39776611328125,
      "objective/kl": 32.743534088134766,
      "objective/non_score_reward": -1.637176752090454,
      "objective/rlhf_reward": -4.94458726412447,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 2.4484572410583496,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.734375,
      "step": 934,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.9993422031402588
    },
    {
      "episode": 14976,
      "epoch": 0.08972930222525793,
      "loss/policy_avg": -0.10944172739982605,
      "lr": 9.40247955010225e-06,
      "objective/entropy": -241.4989013671875,
      "objective/kl": 21.90488052368164,
      "objective/non_score_reward": -1.0952439308166504,
      "objective/rlhf_reward": -3.0217259762033652,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 3.7654926776885986,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.689453125,
      "step": 935,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.002392292022705
    },
    {
      "episode": 14992,
      "epoch": 0.08982516686438749,
      "loss/policy_avg": 0.9405217170715332,
      "lr": 9.401840490797547e-06,
      "objective/entropy": -237.89816284179688,
      "objective/kl": 25.436769485473633,
      "objective/non_score_reward": -1.2718384265899658,
      "objective/rlhf_reward": -3.663521905143825,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 6.3816022872924805,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.60546875,
      "step": 936,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9998328685760498
    },
    {
      "episode": 15008,
      "epoch": 0.08992103150351703,
      "loss/policy_avg": 0.3327906131744385,
      "lr": 9.401201431492844e-06,
      "objective/entropy": -268.6925354003906,
      "objective/kl": 37.998870849609375,
      "objective/non_score_reward": -1.899943470954895,
      "objective/rlhf_reward": -6.0435144593387395,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 10.16036605834961,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.607421875,
      "step": 937,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9984662532806396
    },
    {
      "episode": 15024,
      "epoch": 0.09001689614264659,
      "loss/policy_avg": -0.26467132568359375,
      "lr": 9.40056237218814e-06,
      "objective/entropy": -231.59254455566406,
      "objective/kl": 26.266529083251953,
      "objective/non_score_reward": -1.3133264780044556,
      "objective/rlhf_reward": -3.737534248622593,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 8.63685417175293,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.6328125,
      "step": 938,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999743938446045
    },
    {
      "episode": 15040,
      "epoch": 0.09011276078177613,
      "loss/policy_avg": 0.2447420209646225,
      "lr": 9.399923312883436e-06,
      "objective/entropy": -278.01153564453125,
      "objective/kl": 27.628671646118164,
      "objective/non_score_reward": -1.3814334869384766,
      "objective/rlhf_reward": -4.147132196513516,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 2.7261061668395996,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.669921875,
      "step": 939,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9990365505218506
    },
    {
      "episode": 15056,
      "epoch": 0.09020862542090569,
      "loss/policy_avg": 0.2600797414779663,
      "lr": 9.399284253578733e-06,
      "objective/entropy": -242.6852264404297,
      "objective/kl": 40.91444396972656,
      "objective/non_score_reward": -2.045722484588623,
      "objective/rlhf_reward": -6.060183467642341,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 7.501818656921387,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6484375,
      "step": 940,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9971075057983398
    },
    {
      "episode": 15072,
      "epoch": 0.09030449006003523,
      "loss/policy_avg": 0.3729836940765381,
      "lr": 9.39864519427403e-06,
      "objective/entropy": -225.56338500976562,
      "objective/kl": 34.106658935546875,
      "objective/non_score_reward": -1.7053331136703491,
      "objective/rlhf_reward": -5.340379836972117,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 3.6144325733184814,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.73828125,
      "step": 941,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9977765083312988
    },
    {
      "episode": 15088,
      "epoch": 0.09040035469916478,
      "loss/policy_avg": 0.571183443069458,
      "lr": 9.398006134969327e-06,
      "objective/entropy": -109.51638793945312,
      "objective/kl": 57.49871826171875,
      "objective/non_score_reward": -2.8749358654022217,
      "objective/rlhf_reward": -9.895623478952961,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 5.06275749206543,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.736328125,
      "step": 942,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.000272512435913
    },
    {
      "episode": 15104,
      "epoch": 0.09049621933829433,
      "loss/policy_avg": 0.7253443002700806,
      "lr": 9.397367075664624e-06,
      "objective/entropy": -69.86570739746094,
      "objective/kl": 40.12030029296875,
      "objective/non_score_reward": -2.0060153007507324,
      "objective/rlhf_reward": -6.362201397836792,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 66.08172607421875,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6015625,
      "step": 943,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.997114896774292
    },
    {
      "episode": 15120,
      "epoch": 0.09059208397742388,
      "loss/policy_avg": 0.7548943758010864,
      "lr": 9.396728016359919e-06,
      "objective/entropy": -264.1029357910156,
      "objective/kl": 29.125934600830078,
      "objective/non_score_reward": -1.456296682357788,
      "objective/rlhf_reward": -4.268927424159601,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 1.555539846420288,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6328125,
      "step": 944,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0007224082946777
    },
    {
      "episode": 15136,
      "epoch": 0.09068794861655342,
      "loss/policy_avg": -0.06224450469017029,
      "lr": 9.396088957055216e-06,
      "objective/entropy": -215.80255126953125,
      "objective/kl": 36.1290283203125,
      "objective/non_score_reward": -1.8064515590667725,
      "objective/rlhf_reward": -5.7100345728718604,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 2.062628746032715,
      "policy/clipfrac_avg": 0.25,
      "policy/entropy_avg": 0.5703125,
      "step": 945,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0022828578948975
    },
    {
      "episode": 15152,
      "epoch": 0.09078381325568298,
      "loss/policy_avg": -0.34320878982543945,
      "lr": 9.395449897750511e-06,
      "objective/entropy": -254.14260864257812,
      "objective/kl": 24.163818359375,
      "objective/non_score_reward": -1.20819091796875,
      "objective/rlhf_reward": -2.4327639102935787,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 3.011139392852783,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.62109375,
      "step": 946,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0023422241210938
    },
    {
      "episode": 15168,
      "epoch": 0.09087967789481252,
      "loss/policy_avg": 0.08071097731590271,
      "lr": 9.394810838445808e-06,
      "objective/entropy": -269.91180419921875,
      "objective/kl": 29.857431411743164,
      "objective/non_score_reward": -1.4928715229034424,
      "objective/rlhf_reward": -3.8487801573434215,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 7.305149555206299,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.646484375,
      "step": 947,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9971578121185303
    },
    {
      "episode": 15184,
      "epoch": 0.09097554253394208,
      "loss/policy_avg": -0.019624732434749603,
      "lr": 9.394171779141105e-06,
      "objective/entropy": -274.10198974609375,
      "objective/kl": 33.219993591308594,
      "objective/non_score_reward": -1.6609996557235718,
      "objective/rlhf_reward": -6.643998503684998,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 4.708046913146973,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6015625,
      "step": 948,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.000514030456543
    },
    {
      "episode": 15200,
      "epoch": 0.09107140717307162,
      "loss/policy_avg": -0.5435956716537476,
      "lr": 9.393532719836402e-06,
      "objective/entropy": -245.58270263671875,
      "objective/kl": 26.876476287841797,
      "objective/non_score_reward": -1.3438239097595215,
      "objective/rlhf_reward": -3.771175924603062,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 10.178674697875977,
      "policy/clipfrac_avg": 1.75,
      "policy/entropy_avg": 0.630859375,
      "step": 949,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0033979415893555
    },
    {
      "episode": 15216,
      "epoch": 0.09116727181220118,
      "loss/policy_avg": 0.6083466410636902,
      "lr": 9.392893660531698e-06,
      "objective/entropy": -169.32357788085938,
      "objective/kl": 38.449127197265625,
      "objective/non_score_reward": -1.9224563837051392,
      "objective/rlhf_reward": -6.133565931525782,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 8.572129249572754,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.685546875,
      "step": 950,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.000035285949707
    },
    {
      "episode": 15232,
      "epoch": 0.09126313645133072,
      "loss/policy_avg": 0.1515914499759674,
      "lr": 9.392254601226994e-06,
      "objective/entropy": -181.75010681152344,
      "objective/kl": 31.95659637451172,
      "objective/non_score_reward": -1.5978299379348755,
      "objective/rlhf_reward": -5.04968385985437,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 12.761173248291016,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.798828125,
      "step": 951,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9977487325668335
    },
    {
      "episode": 15248,
      "epoch": 0.09135900109046027,
      "loss/policy_avg": 0.7638048529624939,
      "lr": 9.39161554192229e-06,
      "objective/entropy": -158.99050903320312,
      "objective/kl": 39.69103240966797,
      "objective/non_score_reward": -1.9845517873764038,
      "objective/rlhf_reward": -5.815500917212043,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 11.06544303894043,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.541015625,
      "step": 952,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9981579780578613
    },
    {
      "episode": 15264,
      "epoch": 0.09145486572958982,
      "loss/policy_avg": 0.764492392539978,
      "lr": 9.390976482617587e-06,
      "objective/entropy": -159.26947021484375,
      "objective/kl": 28.415475845336914,
      "objective/non_score_reward": -1.4207737445831299,
      "objective/rlhf_reward": -5.683095276355743,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 7.907594680786133,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.6328125,
      "step": 953,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.997883677482605
    },
    {
      "episode": 15280,
      "epoch": 0.09155073036871937,
      "loss/policy_avg": 0.3368009328842163,
      "lr": 9.390337423312884e-06,
      "objective/entropy": -173.85415649414062,
      "objective/kl": 35.513309478759766,
      "objective/non_score_reward": -1.775665521621704,
      "objective/rlhf_reward": -5.49854234224947,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 6.337751388549805,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.541015625,
      "step": 954,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9994667768478394
    },
    {
      "episode": 15296,
      "epoch": 0.09164659500784891,
      "loss/policy_avg": 0.0456845797598362,
      "lr": 9.389698364008181e-06,
      "objective/entropy": 16.725250244140625,
      "objective/kl": 36.44686508178711,
      "objective/non_score_reward": -1.822343349456787,
      "objective/rlhf_reward": -5.865541179378596,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 21.832763671875,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.548828125,
      "step": 955,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 1.999651551246643
    },
    {
      "episode": 15312,
      "epoch": 0.09174245964697847,
      "loss/policy_avg": 0.0268879272043705,
      "lr": 9.389059304703478e-06,
      "objective/entropy": -219.0832977294922,
      "objective/kl": 25.021286010742188,
      "objective/non_score_reward": -1.2510643005371094,
      "objective/rlhf_reward": -3.1794285133209934,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 13.525361061096191,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.673828125,
      "step": 956,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0001580715179443
    },
    {
      "episode": 15328,
      "epoch": 0.09183832428610801,
      "loss/policy_avg": 0.25198429822921753,
      "lr": 9.388420245398773e-06,
      "objective/entropy": -216.4515838623047,
      "objective/kl": 29.98337173461914,
      "objective/non_score_reward": -1.4991683959960938,
      "objective/rlhf_reward": -3.0729548081171245,
      "objective/scores": 0.7309297535714575,
      "policy/approxkl_avg": 8.199630737304688,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.634765625,
      "step": 957,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9965362548828125
    },
    {
      "episode": 15344,
      "epoch": 0.09193418892523757,
      "loss/policy_avg": 0.035516731441020966,
      "lr": 9.38778118609407e-06,
      "objective/entropy": -250.8704833984375,
      "objective/kl": 30.556961059570312,
      "objective/non_score_reward": -1.5278480052947998,
      "objective/rlhf_reward": -4.73278991231094,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 3.100607395172119,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.607421875,
      "step": 958,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0005221366882324
    },
    {
      "episode": 15360,
      "epoch": 0.09203005356436711,
      "loss/policy_avg": 0.6594608426094055,
      "lr": 9.387142126789367e-06,
      "objective/entropy": -190.2021942138672,
      "objective/kl": 29.693756103515625,
      "objective/non_score_reward": -1.4846878051757812,
      "objective/rlhf_reward": -4.38249173661764,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 11.999906539916992,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.55859375,
      "step": 959,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9964005947113037
    },
    {
      "episode": 15376,
      "epoch": 0.09212591820349667,
      "loss/policy_avg": 0.16847842931747437,
      "lr": 9.386503067484664e-06,
      "objective/entropy": -220.72311401367188,
      "objective/kl": 22.618806838989258,
      "objective/non_score_reward": -1.1309404373168945,
      "objective/rlhf_reward": -3.0731633110955805,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 1.5775080919265747,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.63671875,
      "step": 960,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0047149658203125
    },
    {
      "episode": 15392,
      "epoch": 0.09222178284262621,
      "loss/policy_avg": 0.37361010909080505,
      "lr": 9.38586400817996e-06,
      "objective/entropy": -219.60760498046875,
      "objective/kl": 31.668062210083008,
      "objective/non_score_reward": -1.58340322971344,
      "objective/rlhf_reward": -4.852660181935191,
      "objective/scores": 0.3702381544273198,
      "policy/approxkl_avg": 6.965027809143066,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.615234375,
      "step": 961,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 1.9987406730651855
    },
    {
      "episode": 15408,
      "epoch": 0.09231764748175576,
      "loss/policy_avg": 0.3272181749343872,
      "lr": 9.385224948875256e-06,
      "objective/entropy": -200.26370239257812,
      "objective/kl": 38.33747100830078,
      "objective/non_score_reward": -1.916873574256897,
      "objective/rlhf_reward": -5.720082710461552,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 2.9499969482421875,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.583984375,
      "step": 962,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9983041286468506
    },
    {
      "episode": 15424,
      "epoch": 0.0924135121208853,
      "loss/policy_avg": 0.02453005313873291,
      "lr": 9.384585889570553e-06,
      "objective/entropy": -259.0159606933594,
      "objective/kl": 32.376686096191406,
      "objective/non_score_reward": -1.6188342571258545,
      "objective/rlhf_reward": -5.051504810054866,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 11.491250038146973,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.634765625,
      "step": 963,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.0016989707946777
    },
    {
      "episode": 15440,
      "epoch": 0.09250937676001486,
      "loss/policy_avg": -0.1082817018032074,
      "lr": 9.38394683026585e-06,
      "objective/entropy": -136.52200317382812,
      "objective/kl": 34.37030792236328,
      "objective/non_score_reward": -1.718515396118164,
      "objective/rlhf_reward": -5.212202077329742,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 5.610563278198242,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.65234375,
      "step": 964,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9993445873260498
    },
    {
      "episode": 15456,
      "epoch": 0.0926052413991444,
      "loss/policy_avg": 0.3635658025741577,
      "lr": 9.383307770961147e-06,
      "objective/entropy": -242.04705810546875,
      "objective/kl": 26.167871475219727,
      "objective/non_score_reward": -1.3083934783935547,
      "objective/rlhf_reward": -3.8335740923881527,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 10.497917175292969,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.505859375,
      "step": 965,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.998986840248108
    },
    {
      "episode": 15472,
      "epoch": 0.09270110603827396,
      "loss/policy_avg": 0.4805383086204529,
      "lr": 9.382668711656443e-06,
      "objective/entropy": -130.80931091308594,
      "objective/kl": 43.840057373046875,
      "objective/non_score_reward": -2.192002773284912,
      "objective/rlhf_reward": -6.368011450767517,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 1.2675271034240723,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.619140625,
      "step": 966,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.001443862915039
    },
    {
      "episode": 15488,
      "epoch": 0.0927969706774035,
      "loss/policy_avg": 0.9434456825256348,
      "lr": 9.382029652351739e-06,
      "objective/entropy": -116.85310363769531,
      "objective/kl": 55.79869842529297,
      "objective/non_score_reward": -2.7899351119995117,
      "objective/rlhf_reward": -9.426406518618265,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 2.6991868019104004,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.791015625,
      "step": 967,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 2.00309157371521
    },
    {
      "episode": 15504,
      "epoch": 0.09289283531653306,
      "loss/policy_avg": 0.2830507755279541,
      "lr": 9.381390593047035e-06,
      "objective/entropy": -260.5260925292969,
      "objective/kl": 34.16276550292969,
      "objective/non_score_reward": -1.7081382274627686,
      "objective/rlhf_reward": -5.381954531283721,
      "objective/scores": 0.36264953503719355,
      "policy/approxkl_avg": 4.792706489562988,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.615234375,
      "step": 968,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9985530376434326
    },
    {
      "episode": 15520,
      "epoch": 0.0929886999556626,
      "loss/policy_avg": 0.19756931066513062,
      "lr": 9.380751533742332e-06,
      "objective/entropy": -234.741455078125,
      "objective/kl": 25.891204833984375,
      "objective/non_score_reward": -1.2945603132247925,
      "objective/rlhf_reward": -3.055534782187019,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 2.262695789337158,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.640625,
      "step": 969,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0007271766662598
    },
    {
      "episode": 15536,
      "epoch": 0.09308456459479215,
      "loss/policy_avg": 0.0513734444975853,
      "lr": 9.380112474437628e-06,
      "objective/entropy": -195.60171508789062,
      "objective/kl": 35.50217819213867,
      "objective/non_score_reward": -1.775109052658081,
      "objective/rlhf_reward": -5.741186105941219,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 1.6989755630493164,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.607421875,
      "step": 970,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 2.0012567043304443
    },
    {
      "episode": 15552,
      "epoch": 0.0931804292339217,
      "loss/policy_avg": 0.1513216644525528,
      "lr": 9.379473415132924e-06,
      "objective/entropy": -245.57977294921875,
      "objective/kl": 23.89773941040039,
      "objective/non_score_reward": -1.1948869228363037,
      "objective/rlhf_reward": -4.7795480489730835,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 6.129580020904541,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.69921875,
      "step": 971,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 2.000563621520996
    },
    {
      "episode": 15568,
      "epoch": 0.09327629387305125,
      "loss/policy_avg": 0.041885554790496826,
      "lr": 9.378834355828221e-06,
      "objective/entropy": -261.82769775390625,
      "objective/kl": 24.18181037902832,
      "objective/non_score_reward": -1.2090904712677002,
      "objective/rlhf_reward": -3.457759955016476,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 9.62070369720459,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.533203125,
      "step": 972,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9969148635864258
    },
    {
      "episode": 15584,
      "epoch": 0.0933721585121808,
      "loss/policy_avg": 0.012015002779662609,
      "lr": 9.378195296523518e-06,
      "objective/entropy": -251.767333984375,
      "objective/kl": 27.563173294067383,
      "objective/non_score_reward": -1.378158688545227,
      "objective/rlhf_reward": -3.908514711920338,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 1.0967427492141724,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.634765625,
      "step": 973,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0006260871887207
    },
    {
      "episode": 15600,
      "epoch": 0.09346802315131035,
      "loss/policy_avg": -0.31819072365760803,
      "lr": 9.377556237218815e-06,
      "objective/entropy": -175.70556640625,
      "objective/kl": 28.285152435302734,
      "objective/non_score_reward": -1.4142576456069946,
      "objective/rlhf_reward": -4.052910540167408,
      "objective/scores": 0.40102999566398123,
      "policy/approxkl_avg": 2.37001371383667,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.505859375,
      "step": 974,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9995312690734863
    },
    {
      "episode": 15616,
      "epoch": 0.09356388779043989,
      "loss/policy_avg": 0.6060304641723633,
      "lr": 9.37691717791411e-06,
      "objective/entropy": -34.974281311035156,
      "objective/kl": 35.56610107421875,
      "objective/non_score_reward": -1.7783050537109375,
      "objective/rlhf_reward": -5.59744867065781,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 6.845120906829834,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.853515625,
      "step": 975,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9995267391204834
    },
    {
      "episode": 15632,
      "epoch": 0.09365975242956945,
      "loss/policy_avg": 0.1691616326570511,
      "lr": 9.376278118609407e-06,
      "objective/entropy": -173.51535034179688,
      "objective/kl": 40.181976318359375,
      "objective/non_score_reward": -2.009099006652832,
      "objective/rlhf_reward": -6.657793619719845,
      "objective/scores": 0.34465054211822604,
      "policy/approxkl_avg": 0.46673262119293213,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.71875,
      "step": 976,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.0020508766174316
    },
    {
      "episode": 15648,
      "epoch": 0.09375561706869899,
      "loss/policy_avg": 0.12263473123311996,
      "lr": 9.375639059304704e-06,
      "objective/entropy": -244.26974487304688,
      "objective/kl": 29.573442459106445,
      "objective/non_score_reward": -1.4786722660064697,
      "objective/rlhf_reward": -4.358429758754328,
      "objective/scores": 0.38906482631788786,
      "policy/approxkl_avg": 3.748386859893799,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.693359375,
      "step": 977,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9990627765655518
    },
    {
      "episode": 15664,
      "epoch": 0.09385148170782855,
      "loss/policy_avg": 1.4557695388793945,
      "lr": 9.375000000000001e-06,
      "objective/entropy": -133.55853271484375,
      "objective/kl": 45.2318229675293,
      "objective/non_score_reward": -2.2615909576416016,
      "objective/rlhf_reward": -7.530592167171177,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 4.7986626625061035,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.654296875,
      "step": 978,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9999971389770508
    },
    {
      "episode": 15680,
      "epoch": 0.09394734634695809,
      "loss/policy_avg": 0.04724450409412384,
      "lr": 9.374360940695298e-06,
      "objective/entropy": -291.25103759765625,
      "objective/kl": 28.29153823852539,
      "objective/non_score_reward": -1.4145770072937012,
      "objective/rlhf_reward": -3.710896800236638,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 2.313387393951416,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.625,
      "step": 979,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9993375539779663
    },
    {
      "episode": 15696,
      "epoch": 0.09404321098608764,
      "loss/policy_avg": 0.2293320745229721,
      "lr": 9.373721881390595e-06,
      "objective/entropy": -136.44857788085938,
      "objective/kl": 38.36898422241211,
      "objective/non_score_reward": -1.9184492826461792,
      "objective/rlhf_reward": -5.551090779081855,
      "objective/scores": 0.5306765580733931,
      "policy/approxkl_avg": 2.303453207015991,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.61328125,
      "step": 980,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.999606966972351
    },
    {
      "episode": 15712,
      "epoch": 0.0941390756252172,
      "loss/policy_avg": 0.16989938914775848,
      "lr": 9.37308282208589e-06,
      "objective/entropy": -171.79864501953125,
      "objective/kl": 32.806495666503906,
      "objective/non_score_reward": -1.640324592590332,
      "objective/rlhf_reward": -4.613887022213872,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 8.31067180633545,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.603515625,
      "step": 981,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9984290599822998
    },
    {
      "episode": 15728,
      "epoch": 0.09423494026434674,
      "loss/policy_avg": 0.7234645485877991,
      "lr": 9.372443762781187e-06,
      "objective/entropy": -219.93374633789062,
      "objective/kl": 26.91738510131836,
      "objective/non_score_reward": -1.3458693027496338,
      "objective/rlhf_reward": -0.9834773302078244,
      "objective/scores": 1.1,
      "policy/approxkl_avg": 1.4521507024765015,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.646484375,
      "step": 982,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.003150463104248
    },
    {
      "episode": 15744,
      "epoch": 0.0943308049034763,
      "loss/policy_avg": 0.48133015632629395,
      "lr": 9.371804703476484e-06,
      "objective/entropy": -282.47552490234375,
      "objective/kl": 39.29179763793945,
      "objective/non_score_reward": -1.9645898342132568,
      "objective/rlhf_reward": -6.125026241938272,
      "objective/scores": 0.43333333333333335,
      "policy/approxkl_avg": 6.169063568115234,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.720703125,
      "step": 983,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.997374415397644
    },
    {
      "episode": 15760,
      "epoch": 0.09442666954260584,
      "loss/policy_avg": 0.1187177523970604,
      "lr": 9.37116564417178e-06,
      "objective/entropy": -158.33642578125,
      "objective/kl": 40.20547103881836,
      "objective/non_score_reward": -2.0102736949920654,
      "objective/rlhf_reward": -6.69945864966455,
      "objective/scores": 0.33540891336663825,
      "policy/approxkl_avg": 3.5165886878967285,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.62109375,
      "step": 984,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9981753826141357
    },
    {
      "episode": 15776,
      "epoch": 0.0945225341817354,
      "loss/policy_avg": 0.16677279770374298,
      "lr": 9.370526584867077e-06,
      "objective/entropy": -162.21728515625,
      "objective/kl": 33.61964797973633,
      "objective/non_score_reward": -1.6809823513031006,
      "objective/rlhf_reward": -5.323929286003112,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 5.913999557495117,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.734375,
      "step": 985,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9968831539154053
    },
    {
      "episode": 15792,
      "epoch": 0.09461839882086494,
      "loss/policy_avg": 0.22338780760765076,
      "lr": 9.369887525562373e-06,
      "objective/entropy": -191.39588928222656,
      "objective/kl": 50.39151382446289,
      "objective/non_score_reward": -2.519575595855713,
      "objective/rlhf_reward": -8.416443472326385,
      "objective/scores": 0.41546487678572874,
      "policy/approxkl_avg": 45.444732666015625,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.755859375,
      "step": 986,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.998542308807373
    },
    {
      "episode": 15808,
      "epoch": 0.0947142634599945,
      "loss/policy_avg": 0.37791919708251953,
      "lr": 9.36924846625767e-06,
      "objective/entropy": -270.806396484375,
      "objective/kl": 29.205078125,
      "objective/non_score_reward": -1.4602539539337158,
      "objective/rlhf_reward": -5.841015696525574,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 8.895004272460938,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.6875,
      "step": 987,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.9979805946350098
    },
    {
      "episode": 15824,
      "epoch": 0.09481012809912404,
      "loss/policy_avg": 0.7314577102661133,
      "lr": 9.368609406952966e-06,
      "objective/entropy": -174.33633422851562,
      "objective/kl": 41.00555419921875,
      "objective/non_score_reward": -2.0502774715423584,
      "objective/rlhf_reward": -6.77727790613946,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 4.151052474975586,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.59375,
      "step": 988,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 1.998262882232666
    },
    {
      "episode": 15840,
      "epoch": 0.09490599273825359,
      "loss/policy_avg": 0.1200692355632782,
      "lr": 9.367970347648263e-06,
      "objective/entropy": -259.9232177734375,
      "objective/kl": 32.56160354614258,
      "objective/non_score_reward": -1.628080129623413,
      "objective/rlhf_reward": -5.112320518493652,
      "objective/scores": 0.35,
      "policy/approxkl_avg": 3.3896703720092773,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.615234375,
      "step": 989,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.0001492500305176
    },
    {
      "episode": 15856,
      "epoch": 0.09500185737738313,
      "loss/policy_avg": 0.7871278524398804,
      "lr": 9.367331288343558e-06,
      "objective/entropy": -162.90664672851562,
      "objective/kl": 37.55353927612305,
      "objective/non_score_reward": -1.8776767253875732,
      "objective/rlhf_reward": -6.086875279148188,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 24.93891716003418,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.7890625,
      "step": 990,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9959328174591064
    },
    {
      "episode": 15872,
      "epoch": 0.09509772201651269,
      "loss/policy_avg": -0.12516134977340698,
      "lr": 9.366692229038855e-06,
      "objective/entropy": -238.83116149902344,
      "objective/kl": 37.03616714477539,
      "objective/non_score_reward": -1.8518084287643433,
      "objective/rlhf_reward": -6.047983967994137,
      "objective/scores": 0.33981246656813147,
      "policy/approxkl_avg": 15.576482772827148,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.681640625,
      "step": 991,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.9985809326171875
    },
    {
      "episode": 15888,
      "epoch": 0.09519358665564223,
      "loss/policy_avg": -0.04968651384115219,
      "lr": 9.366053169734152e-06,
      "objective/entropy": -183.43231201171875,
      "objective/kl": 35.40851593017578,
      "objective/non_score_reward": -1.77042555809021,
      "objective/rlhf_reward": -5.756189737349672,
      "objective/scores": 0.3313782131597591,
      "policy/approxkl_avg": 0.5774535536766052,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.583984375,
      "step": 992,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 5,
      "val/ratio": 2.002680778503418
    },
    {
      "episode": 15904,
      "epoch": 0.09528945129477179,
      "loss/policy_avg": 0.009859908372163773,
      "lr": 9.365414110429449e-06,
      "objective/entropy": -14.670166015625,
      "objective/kl": 53.70581817626953,
      "objective/non_score_reward": -2.685290813446045,
      "objective/rlhf_reward": -8.3411630153656,
      "objective/scores": 0.6,
      "policy/approxkl_avg": 1.3184102773666382,
      "policy/clipfrac_avg": 1.0,
      "policy/entropy_avg": 0.681640625,
      "step": 993,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 3,
      "val/ratio": 2.0005533695220947
    },
    {
      "episode": 15920,
      "epoch": 0.09538531593390133,
      "loss/policy_avg": 0.3695295453071594,
      "lr": 9.364775051124744e-06,
      "objective/entropy": -288.468505859375,
      "objective/kl": 32.96984100341797,
      "objective/non_score_reward": -1.6484923362731934,
      "objective/rlhf_reward": -5.0781975624882545,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 3.1653892993927,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.642578125,
      "step": 994,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.999734878540039
    },
    {
      "episode": 15936,
      "epoch": 0.09548118057303089,
      "loss/policy_avg": 0.3992432951927185,
      "lr": 9.364135991820041e-06,
      "objective/entropy": -231.646728515625,
      "objective/kl": 34.67195510864258,
      "objective/non_score_reward": -1.733597755432129,
      "objective/rlhf_reward": -5.510559280117121,
      "objective/scores": 0.35595802480981553,
      "policy/approxkl_avg": 19.767539978027344,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.849609375,
      "step": 995,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.9987037181854248
    },
    {
      "episode": 15952,
      "epoch": 0.09557704521216043,
      "loss/policy_avg": 0.03356311097741127,
      "lr": 9.363496932515338e-06,
      "objective/entropy": -210.72410583496094,
      "objective/kl": 27.1010799407959,
      "objective/non_score_reward": -1.3550540208816528,
      "objective/rlhf_reward": -3.595387215885233,
      "objective/scores": 0.4562071871080222,
      "policy/approxkl_avg": 1.0958271026611328,
      "policy/clipfrac_avg": 1.25,
      "policy/entropy_avg": 0.7265625,
      "step": 996,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.9994826316833496
    },
    {
      "episode": 15968,
      "epoch": 0.09567290985128998,
      "loss/policy_avg": 1.1218140125274658,
      "lr": 9.362857873210635e-06,
      "objective/entropy": -71.63316345214844,
      "objective/kl": 40.19666290283203,
      "objective/non_score_reward": -2.009833335876465,
      "objective/rlhf_reward": -8.03933310508728,
      "objective/scores": 0.0,
      "policy/approxkl_avg": 3.4838500022888184,
      "policy/clipfrac_avg": 0.75,
      "policy/entropy_avg": 0.66015625,
      "step": 997,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 4,
      "val/ratio": 1.9991953372955322
    },
    {
      "episode": 15984,
      "epoch": 0.09576877449041953,
      "loss/policy_avg": 0.23440885543823242,
      "lr": 9.362218813905932e-06,
      "objective/entropy": -217.69229125976562,
      "objective/kl": 26.445728302001953,
      "objective/non_score_reward": -1.3222863674163818,
      "objective/rlhf_reward": -3.773374044688877,
      "objective/scores": 0.37894294565112985,
      "policy/approxkl_avg": 11.445338249206543,
      "policy/clipfrac_avg": 0.5,
      "policy/entropy_avg": 0.623046875,
      "step": 998,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 1.9970027208328247
    },
    {
      "episode": 16000,
      "epoch": 0.09586463912954908,
      "loss/policy_avg": -0.3169388175010681,
      "lr": 9.361579754601227e-06,
      "objective/entropy": -116.28077697753906,
      "objective/kl": 44.722564697265625,
      "objective/non_score_reward": -2.236128091812134,
      "objective/rlhf_reward": -6.997101019101079,
      "objective/scores": 0.4868528072345416,
      "policy/approxkl_avg": 4.412589073181152,
      "policy/clipfrac_avg": 1.5,
      "policy/entropy_avg": 0.734375,
      "step": 999,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 2.000357151031494
    }
  ],
  "logging_steps": 500,
  "max_steps": 7824,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3.0,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": true,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0,
  "train_batch_size": null,
  "trial_name": null,
  "trial_params": null
}