diff --git "a/checkpoint-600/trainer_state.json" "b/checkpoint-600/trainer_state.json" deleted file mode 100644--- "a/checkpoint-600/trainer_state.json" +++ /dev/null @@ -1,10834 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "episode": 9600, - "epoch": 0.05751878347772944, - "eval_steps": 500, - "global_step": 600, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "episode": 16, - "epoch": 9.586463912954908e-05, - "loss/policy_avg": 0.0339290015399456, - "lr": 1e-05, - "objective/entropy": 78.48619842529297, - "objective/kl": 5.6675214767456055, - "objective/non_score_reward": -0.2833760380744934, - "objective/rlhf_reward": 3.2664958328008655, - "objective/scores": 1.1, - "policy/approxkl_avg": 56.270538330078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5703125, - "step": 0, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000828266143799 - }, - { - "episode": 32, - "epoch": 0.00019172927825909816, - "loss/policy_avg": 0.032509539276361465, - "lr": 9.999360940695298e-06, - "objective/entropy": 39.34157943725586, - "objective/kl": 8.134885787963867, - "objective/non_score_reward": -0.40674424171447754, - "objective/rlhf_reward": -1.6269769463688135, - "objective/scores": 0.0, - "policy/approxkl_avg": 125.53129577636719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46875, - "step": 1, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994192123413086 - }, - { - "episode": 48, - "epoch": 0.00028759391738864725, - "loss/policy_avg": 0.2574540972709656, - "lr": 9.998721881390595e-06, - "objective/entropy": 35.90438461303711, - "objective/kl": 10.056818008422852, - "objective/non_score_reward": -0.5028409957885742, - "objective/rlhf_reward": -4.011363983154297, - "objective/scores": -0.5, - "policy/approxkl_avg": 197.81790161132812, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.73828125, - "step": 2, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99604332447052 - }, - { - "episode": 64, - "epoch": 0.0003834585565181963, - "loss/policy_avg": 0.1315518617630005, - "lr": 9.99808282208589e-06, - "objective/entropy": 163.52642822265625, - "objective/kl": 12.497467041015625, - "objective/non_score_reward": -0.6248733997344971, - "objective/rlhf_reward": -2.499493680894375, - "objective/scores": 0.0, - "policy/approxkl_avg": 280.7725830078125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.716796875, - "step": 3, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999171257019043 - }, - { - "episode": 80, - "epoch": 0.0004793231956477454, - "loss/policy_avg": 0.024046147242188454, - "lr": 9.997443762781187e-06, - "objective/entropy": 118.5094223022461, - "objective/kl": 5.982309818267822, - "objective/non_score_reward": -0.29911553859710693, - "objective/rlhf_reward": -1.196462158113718, - "objective/scores": 0.0, - "policy/approxkl_avg": 52.543487548828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.552734375, - "step": 4, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001847743988037 - }, - { - "episode": 96, - "epoch": 0.0005751878347772945, - "loss/policy_avg": 0.10632362961769104, - "lr": 9.996804703476484e-06, - "objective/entropy": 152.1885986328125, - "objective/kl": 7.815367698669434, - "objective/non_score_reward": -0.3907684087753296, - "objective/rlhf_reward": 0.0987858943933384, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 88.51527404785156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5859375, - "step": 5, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992973804473877 - }, - { - "episode": 112, - "epoch": 0.0006710524739068436, - "loss/policy_avg": 0.08422186970710754, - "lr": 9.99616564417178e-06, - "objective/entropy": 75.0154037475586, - "objective/kl": 17.52770233154297, - "objective/non_score_reward": -0.8763852119445801, - "objective/rlhf_reward": -3.5055407360196114, - "objective/scores": 0.0, - "policy/approxkl_avg": 443.602294921875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.505859375, - "step": 6, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9972598552703857 - }, - { - "episode": 128, - "epoch": 0.0007669171130363926, - "loss/policy_avg": 0.007405903190374374, - "lr": 9.995526584867077e-06, - "objective/entropy": 51.67172622680664, - "objective/kl": 11.563663482666016, - "objective/non_score_reward": -0.5781831741333008, - "objective/rlhf_reward": -4.312732696533203, - "objective/scores": -0.5, - "policy/approxkl_avg": 126.90079498291016, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.546875, - "step": 7, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0016322135925293 - }, - { - "episode": 144, - "epoch": 0.0008627817521659417, - "loss/policy_avg": 0.13771404325962067, - "lr": 9.994887525562374e-06, - "objective/entropy": 240.35464477539062, - "objective/kl": 18.096904754638672, - "objective/non_score_reward": -0.9048453569412231, - "objective/rlhf_reward": -1.6719702733325317, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 460.8926696777344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.90625, - "step": 8, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999654293060303 - }, - { - "episode": 160, - "epoch": 0.0009586463912954908, - "loss/policy_avg": 0.41069674491882324, - "lr": 9.99424846625767e-06, - "objective/entropy": 224.78262329101562, - "objective/kl": 11.231921195983887, - "objective/non_score_reward": -0.5615960955619812, - "objective/rlhf_reward": -0.8225522383051791, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 167.4181671142578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7578125, - "step": 9, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9952213764190674 - }, - { - "episode": 176, - "epoch": 0.0010545110304250398, - "loss/policy_avg": 0.2340843677520752, - "lr": 9.993609406952966e-06, - "objective/entropy": 77.48204040527344, - "objective/kl": 13.726895332336426, - "objective/non_score_reward": -0.6863448619842529, - "objective/rlhf_reward": -0.34537934362888345, - "objective/scores": 0.6, - "policy/approxkl_avg": 270.8516845703125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.876953125, - "step": 10, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9955778121948242 - }, - { - "episode": 192, - "epoch": 0.001150375669554589, - "loss/policy_avg": 0.1845349222421646, - "lr": 9.992970347648263e-06, - "objective/entropy": -45.138362884521484, - "objective/kl": 14.76271915435791, - "objective/non_score_reward": -0.7381359338760376, - "objective/rlhf_reward": -1.2192103425661722, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 207.85874938964844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.703125, - "step": 11, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000040292739868 - }, - { - "episode": 208, - "epoch": 0.001246240308684138, - "loss/policy_avg": 0.6059431433677673, - "lr": 9.992331288343558e-06, - "objective/entropy": 40.190372467041016, - "objective/kl": 19.720378875732422, - "objective/non_score_reward": -0.9860190749168396, - "objective/rlhf_reward": -1.821370030120883, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 268.6492919921875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.654296875, - "step": 12, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9966726303100586 - }, - { - "episode": 224, - "epoch": 0.0013421049478136871, - "loss/policy_avg": -0.0064672790467739105, - "lr": 9.991692229038855e-06, - "objective/entropy": 108.48332214355469, - "objective/kl": 5.689068794250488, - "objective/non_score_reward": -0.28445348143577576, - "objective/rlhf_reward": -1.1378139406442642, - "objective/scores": 0.0, - "policy/approxkl_avg": 15.834894180297852, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4716796875, - "step": 13, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0026183128356934 - }, - { - "episode": 240, - "epoch": 0.001437969586943236, - "loss/policy_avg": 0.6670212745666504, - "lr": 9.991053169734152e-06, - "objective/entropy": 0.18174362182617188, - "objective/kl": 12.982845306396484, - "objective/non_score_reward": -0.6491422653198242, - "objective/rlhf_reward": -2.596569076180458, - "objective/scores": 0.0, - "policy/approxkl_avg": 330.118896484375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.759765625, - "step": 14, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997687339782715 - }, - { - "episode": 256, - "epoch": 0.0015338342260727853, - "loss/policy_avg": 0.21263472735881805, - "lr": 9.990414110429449e-06, - "objective/entropy": 249.88232421875, - "objective/kl": 9.040252685546875, - "objective/non_score_reward": -0.45201271772384644, - "objective/rlhf_reward": -1.808050960302353, - "objective/scores": 0.0, - "policy/approxkl_avg": 102.57914733886719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.744140625, - "step": 15, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000203847885132 - }, - { - "episode": 272, - "epoch": 0.0016296988652023342, - "loss/policy_avg": 0.01660698838531971, - "lr": 9.989775051124744e-06, - "objective/entropy": 106.64703369140625, - "objective/kl": 11.038640022277832, - "objective/non_score_reward": -0.5519319772720337, - "objective/rlhf_reward": -2.2077280431985855, - "objective/scores": 0.0, - "policy/approxkl_avg": 164.40890502929688, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.654296875, - "step": 16, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000194549560547 - }, - { - "episode": 288, - "epoch": 0.0017255635043318834, - "loss/policy_avg": 0.17964985966682434, - "lr": 9.989135991820041e-06, - "objective/entropy": 29.59412956237793, - "objective/kl": 11.429637908935547, - "objective/non_score_reward": -0.5714819431304932, - "objective/rlhf_reward": -2.2859277576208115, - "objective/scores": 0.0, - "policy/approxkl_avg": 113.22151184082031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.642578125, - "step": 17, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986356496810913 - }, - { - "episode": 304, - "epoch": 0.0018214281434614326, - "loss/policy_avg": 0.1845007836818695, - "lr": 9.988496932515338e-06, - "objective/entropy": -2.3180160522460938, - "objective/kl": 15.66268539428711, - "objective/non_score_reward": -0.7831343412399292, - "objective/rlhf_reward": -1.773287498687191, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 162.00823974609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8046875, - "step": 18, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0002946853637695 - }, - { - "episode": 320, - "epoch": 0.0019172927825909815, - "loss/policy_avg": 0.14623276889324188, - "lr": 9.987857873210635e-06, - "objective/entropy": 26.79373550415039, - "objective/kl": 16.199674606323242, - "objective/non_score_reward": -0.8099837303161621, - "objective/rlhf_reward": -3.2399348318576813, - "objective/scores": 0.0, - "policy/approxkl_avg": 345.71685791015625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7109375, - "step": 19, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992010593414307 - }, - { - "episode": 336, - "epoch": 0.0020131574217205307, - "loss/policy_avg": 0.061316944658756256, - "lr": 9.987218813905932e-06, - "objective/entropy": 30.27604866027832, - "objective/kl": 13.349930763244629, - "objective/non_score_reward": -0.6674965620040894, - "objective/rlhf_reward": -4.669986248016357, - "objective/scores": -0.5, - "policy/approxkl_avg": 182.816650390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.56640625, - "step": 20, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.996535301208496 - }, - { - "episode": 352, - "epoch": 0.0021090220608500796, - "loss/policy_avg": -0.08272892981767654, - "lr": 9.986579754601228e-06, - "objective/entropy": 198.63003540039062, - "objective/kl": 11.5382719039917, - "objective/non_score_reward": -0.5769136548042297, - "objective/rlhf_reward": -0.9076545149087907, - "objective/scores": 0.35, - "policy/approxkl_avg": 159.97686767578125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.974609375, - "step": 21, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998477578163147 - }, - { - "episode": 368, - "epoch": 0.0022048866999796286, - "loss/policy_avg": 0.23813551664352417, - "lr": 9.985940695296524e-06, - "objective/entropy": 181.51829528808594, - "objective/kl": 13.08276653289795, - "objective/non_score_reward": -0.6541383266448975, - "objective/rlhf_reward": -4.61655330657959, - "objective/scores": -0.5, - "policy/approxkl_avg": 108.47281646728516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6484375, - "step": 22, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991295337677002 - }, - { - "episode": 384, - "epoch": 0.002300751339109178, - "loss/policy_avg": 0.36420387029647827, - "lr": 9.98530163599182e-06, - "objective/entropy": 257.93609619140625, - "objective/kl": 14.696407318115234, - "objective/non_score_reward": -0.7348203063011169, - "objective/rlhf_reward": -1.3351611531415755, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 150.4597625732422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.849609375, - "step": 23, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980016946792603 - }, - { - "episode": 400, - "epoch": 0.002396615978238727, - "loss/policy_avg": 0.20780539512634277, - "lr": 9.984662576687117e-06, - "objective/entropy": -139.27951049804688, - "objective/kl": 15.462644577026367, - "objective/non_score_reward": -0.77313232421875, - "objective/rlhf_reward": -5.092529296875, - "objective/scores": -0.5, - "policy/approxkl_avg": 237.78317260742188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 24, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999894618988037 - }, - { - "episode": 416, - "epoch": 0.002492480617368276, - "loss/policy_avg": 0.2547074556350708, - "lr": 9.984023517382414e-06, - "objective/entropy": 103.24639892578125, - "objective/kl": 17.307334899902344, - "objective/non_score_reward": -0.8653668165206909, - "objective/rlhf_reward": -5.461467266082764, - "objective/scores": -0.5, - "policy/approxkl_avg": 167.2418212890625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65625, - "step": 25, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000932216644287 - }, - { - "episode": 432, - "epoch": 0.002588345256497825, - "loss/policy_avg": 0.3379603624343872, - "lr": 9.983384458077711e-06, - "objective/entropy": 120.86388397216797, - "objective/kl": 14.275808334350586, - "objective/non_score_reward": -0.7137903571128845, - "objective/rlhf_reward": -2.8551614582538605, - "objective/scores": 0.0, - "policy/approxkl_avg": 99.34181213378906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.73828125, - "step": 26, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994611740112305 - }, - { - "episode": 448, - "epoch": 0.0026842098956273742, - "loss/policy_avg": 0.1328231394290924, - "lr": 9.982745398773006e-06, - "objective/entropy": 154.86619567871094, - "objective/kl": 14.35202693939209, - "objective/non_score_reward": -0.7176014184951782, - "objective/rlhf_reward": -4.870405673980713, - "objective/scores": -0.5, - "policy/approxkl_avg": 91.33482360839844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.724609375, - "step": 27, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9957685470581055 - }, - { - "episode": 464, - "epoch": 0.002780074534756923, - "loss/policy_avg": 0.2314174473285675, - "lr": 9.982106339468303e-06, - "objective/entropy": 60.19127655029297, - "objective/kl": 12.188166618347168, - "objective/non_score_reward": -0.6094082593917847, - "objective/rlhf_reward": -2.4376331865787506, - "objective/scores": 0.0, - "policy/approxkl_avg": 205.1094970703125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.810546875, - "step": 28, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.994727611541748 - }, - { - "episode": 480, - "epoch": 0.002875939173886472, - "loss/policy_avg": 0.1261996328830719, - "lr": 9.9814672801636e-06, - "objective/entropy": 67.08200073242188, - "objective/kl": 16.607372283935547, - "objective/non_score_reward": -0.8303685784339905, - "objective/rlhf_reward": -3.3214742839336395, - "objective/scores": 0.0, - "policy/approxkl_avg": 226.6929168701172, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.53125, - "step": 29, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9976527690887451 - }, - { - "episode": 496, - "epoch": 0.0029718038130160216, - "loss/policy_avg": 0.35239556431770325, - "lr": 9.980828220858897e-06, - "objective/entropy": 204.22116088867188, - "objective/kl": 14.144770622253418, - "objective/non_score_reward": -0.7072385549545288, - "objective/rlhf_reward": 1.5710457801818851, - "objective/scores": 1.1, - "policy/approxkl_avg": 124.6588363647461, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7578125, - "step": 30, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9977011680603027 - }, - { - "episode": 512, - "epoch": 0.0030676684521455705, - "loss/policy_avg": 0.26766547560691833, - "lr": 9.980189161554194e-06, - "objective/entropy": 107.69725036621094, - "objective/kl": 12.877479553222656, - "objective/non_score_reward": -0.643873929977417, - "objective/rlhf_reward": -2.5754958018660545, - "objective/scores": 0.0, - "policy/approxkl_avg": 186.40504455566406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.701171875, - "step": 31, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9972684383392334 - }, - { - "episode": 528, - "epoch": 0.0031635330912751195, - "loss/policy_avg": 0.017455143854022026, - "lr": 9.97955010224949e-06, - "objective/entropy": 69.81261444091797, - "objective/kl": 15.58060073852539, - "objective/non_score_reward": -0.7790300250053406, - "objective/rlhf_reward": -0.19240116024133824, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 326.45733642578125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.671875, - "step": 32, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994511604309082 - }, - { - "episode": 544, - "epoch": 0.0032593977304046684, - "loss/policy_avg": 0.1626880019903183, - "lr": 9.978911042944786e-06, - "objective/entropy": 49.244285583496094, - "objective/kl": 11.25068473815918, - "objective/non_score_reward": -0.5625342130661011, - "objective/rlhf_reward": -0.871534817901951, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 50.66204071044922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7109375, - "step": 33, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9966607093811035 - }, - { - "episode": 560, - "epoch": 0.003355262369534218, - "loss/policy_avg": 0.18031546473503113, - "lr": 9.978271983640083e-06, - "objective/entropy": 147.95474243164062, - "objective/kl": 15.950370788574219, - "objective/non_score_reward": -0.7975186109542847, - "objective/rlhf_reward": -3.190074533224106, - "objective/scores": 0.0, - "policy/approxkl_avg": 201.51080322265625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.515625, - "step": 34, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0006275177001953 - }, - { - "episode": 576, - "epoch": 0.0034511270086637668, - "loss/policy_avg": -0.05689749866724014, - "lr": 9.977632924335378e-06, - "objective/entropy": 4.240108489990234, - "objective/kl": 13.38272762298584, - "objective/non_score_reward": -0.6691364049911499, - "objective/rlhf_reward": -4.6765456199646, - "objective/scores": -0.5, - "policy/approxkl_avg": 247.30409240722656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.796875, - "step": 35, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9973469972610474 - }, - { - "episode": 592, - "epoch": 0.0035469916477933157, - "loss/policy_avg": 0.16461774706840515, - "lr": 9.976993865030675e-06, - "objective/entropy": 123.00151824951172, - "objective/kl": 11.21810531616211, - "objective/non_score_reward": -0.5609052181243896, - "objective/rlhf_reward": -4.243620872497559, - "objective/scores": -0.5, - "policy/approxkl_avg": 64.79019927978516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.712890625, - "step": 36, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0026960372924805 - }, - { - "episode": 608, - "epoch": 0.003642856286922865, - "loss/policy_avg": 0.15750062465667725, - "lr": 9.976354805725972e-06, - "objective/entropy": 30.60162925720215, - "objective/kl": 17.013538360595703, - "objective/non_score_reward": -0.850676953792572, - "objective/rlhf_reward": -3.4027078449726105, - "objective/scores": 0.0, - "policy/approxkl_avg": 234.95870971679688, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7421875, - "step": 37, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998824119567871 - }, - { - "episode": 624, - "epoch": 0.003738720926052414, - "loss/policy_avg": 0.03158241882920265, - "lr": 9.975715746421269e-06, - "objective/entropy": 115.71566772460938, - "objective/kl": 15.653677940368652, - "objective/non_score_reward": -0.7826838493347168, - "objective/rlhf_reward": -3.1307354420423508, - "objective/scores": 0.0, - "policy/approxkl_avg": 233.744873046875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.765625, - "step": 38, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997577428817749 - }, - { - "episode": 640, - "epoch": 0.003834585565181963, - "loss/policy_avg": -0.031586866825819016, - "lr": 9.975076687116566e-06, - "objective/entropy": 70.69473266601562, - "objective/kl": 13.526529312133789, - "objective/non_score_reward": -0.6763265132904053, - "objective/rlhf_reward": 1.6946939915418628, - "objective/scores": 1.1, - "policy/approxkl_avg": 101.47872924804688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.677734375, - "step": 39, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0043745040893555 - }, - { - "episode": 656, - "epoch": 0.003930450204311512, - "loss/policy_avg": 0.12032957375049591, - "lr": 9.97443762781186e-06, - "objective/entropy": 172.61669921875, - "objective/kl": 16.604652404785156, - "objective/non_score_reward": -0.8302326798439026, - "objective/rlhf_reward": -5.320930480957031, - "objective/scores": -0.5, - "policy/approxkl_avg": 220.98178100585938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.642578125, - "step": 40, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9957454204559326 - }, - { - "episode": 672, - "epoch": 0.004026314843441061, - "loss/policy_avg": 0.0876859575510025, - "lr": 9.973798568507158e-06, - "objective/entropy": 12.816411972045898, - "objective/kl": 13.908916473388672, - "objective/non_score_reward": -0.6954457759857178, - "objective/rlhf_reward": -4.781783103942871, - "objective/scores": -0.5, - "policy/approxkl_avg": 74.01300048828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.640625, - "step": 41, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0018563270568848 - }, - { - "episode": 688, - "epoch": 0.00412217948257061, - "loss/policy_avg": 0.1008758619427681, - "lr": 9.973159509202454e-06, - "objective/entropy": 257.1292724609375, - "objective/kl": 11.528783798217773, - "objective/non_score_reward": -0.5764391422271729, - "objective/rlhf_reward": -2.3057566583156586, - "objective/scores": 0.0, - "policy/approxkl_avg": 84.59580993652344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.75390625, - "step": 42, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9929132461547852 - }, - { - "episode": 704, - "epoch": 0.004218044121700159, - "loss/policy_avg": 0.44639891386032104, - "lr": 9.972520449897751e-06, - "objective/entropy": 62.445350646972656, - "objective/kl": 13.397602081298828, - "objective/non_score_reward": -0.6698801517486572, - "objective/rlhf_reward": -4.679520606994629, - "objective/scores": -0.5, - "policy/approxkl_avg": 185.67079162597656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.533203125, - "step": 43, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992148876190186 - }, - { - "episode": 720, - "epoch": 0.004313908760829708, - "loss/policy_avg": 0.09568839520215988, - "lr": 9.971881390593048e-06, - "objective/entropy": 129.84619140625, - "objective/kl": 14.350381851196289, - "objective/non_score_reward": -0.7175191640853882, - "objective/rlhf_reward": -0.9226653081940968, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 253.10037231445312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7421875, - "step": 44, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9957365989685059 - }, - { - "episode": 736, - "epoch": 0.004409773399959257, - "loss/policy_avg": 0.22084593772888184, - "lr": 9.971242331288345e-06, - "objective/entropy": -42.89992904663086, - "objective/kl": 15.893115997314453, - "objective/non_score_reward": -0.7946557998657227, - "objective/rlhf_reward": -1.44528977672259, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 121.7098388671875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.751953125, - "step": 45, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000582218170166 - }, - { - "episode": 752, - "epoch": 0.004505638039088807, - "loss/policy_avg": 0.22875869274139404, - "lr": 9.97060327198364e-06, - "objective/entropy": 9.025165557861328, - "objective/kl": 22.01996612548828, - "objective/non_score_reward": -1.1009982824325562, - "objective/rlhf_reward": -1.48027405583975, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 321.845703125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5859375, - "step": 46, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9953069686889648 - }, - { - "episode": 768, - "epoch": 0.004601502678218356, - "loss/policy_avg": 0.07661572843790054, - "lr": 9.969964212678937e-06, - "objective/entropy": 10.382087707519531, - "objective/kl": 13.358439445495605, - "objective/non_score_reward": -0.6679220199584961, - "objective/rlhf_reward": 1.7283119499683384, - "objective/scores": 1.1, - "policy/approxkl_avg": 29.50304412841797, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.638671875, - "step": 47, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999579668045044 - }, - { - "episode": 784, - "epoch": 0.004697367317347905, - "loss/policy_avg": 0.19636262953281403, - "lr": 9.969325153374234e-06, - "objective/entropy": 0.6832618713378906, - "objective/kl": 8.836541175842285, - "objective/non_score_reward": -0.4418269991874695, - "objective/rlhf_reward": -1.767308071255684, - "objective/scores": 0.0, - "policy/approxkl_avg": 94.3209228515625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.712890625, - "step": 48, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998420000076294 - }, - { - "episode": 800, - "epoch": 0.004793231956477454, - "loss/policy_avg": -0.02652953751385212, - "lr": 9.968686094069531e-06, - "objective/entropy": 125.6042709350586, - "objective/kl": 15.016199111938477, - "objective/non_score_reward": -0.7508100271224976, - "objective/rlhf_reward": -5.00324010848999, - "objective/scores": -0.5, - "policy/approxkl_avg": 207.12213134765625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7421875, - "step": 49, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.002742052078247 - }, - { - "episode": 816, - "epoch": 0.004889096595607003, - "loss/policy_avg": 0.10039197653532028, - "lr": 9.968047034764828e-06, - "objective/entropy": -24.506595611572266, - "objective/kl": 18.726213455200195, - "objective/non_score_reward": -0.9363107085227966, - "objective/rlhf_reward": -5.745243072509766, - "objective/scores": -0.5, - "policy/approxkl_avg": 300.677490234375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6953125, - "step": 50, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997317790985107 - }, - { - "episode": 832, - "epoch": 0.004984961234736552, - "loss/policy_avg": 0.18666991591453552, - "lr": 9.967407975460123e-06, - "objective/entropy": -47.62429428100586, - "objective/kl": 13.258740425109863, - "objective/non_score_reward": -0.6629370450973511, - "objective/rlhf_reward": -1.0954889049201753, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 110.42059326171875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.685546875, - "step": 51, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9965816736221313 - }, - { - "episode": 848, - "epoch": 0.005080825873866101, - "loss/policy_avg": 0.195734903216362, - "lr": 9.96676891615542e-06, - "objective/entropy": 57.396114349365234, - "objective/kl": 15.980720520019531, - "objective/non_score_reward": -0.7990360856056213, - "objective/rlhf_reward": -5.196144104003906, - "objective/scores": -0.5, - "policy/approxkl_avg": 125.00595092773438, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.775390625, - "step": 52, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.995786428451538 - }, - { - "episode": 864, - "epoch": 0.00517669051299565, - "loss/policy_avg": 0.24296848475933075, - "lr": 9.966129856850717e-06, - "objective/entropy": 99.57502746582031, - "objective/kl": 17.62392807006836, - "objective/non_score_reward": -0.8811964988708496, - "objective/rlhf_reward": -2.0090143916928134, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 99.32807922363281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.662109375, - "step": 53, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0016443729400635 - }, - { - "episode": 880, - "epoch": 0.0052725551521251995, - "loss/policy_avg": 0.5130509734153748, - "lr": 9.965490797546014e-06, - "objective/entropy": 34.0892219543457, - "objective/kl": 14.999124526977539, - "objective/non_score_reward": -0.7499562501907349, - "objective/rlhf_reward": -1.4840530840479695, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 130.593017578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.75390625, - "step": 54, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000711679458618 - }, - { - "episode": 896, - "epoch": 0.0053684197912547485, - "loss/policy_avg": 0.3232521116733551, - "lr": 9.96485173824131e-06, - "objective/entropy": 69.26298522949219, - "objective/kl": 21.724315643310547, - "objective/non_score_reward": -1.086215853691101, - "objective/rlhf_reward": -2.788603871074274, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 234.98104858398438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.68359375, - "step": 55, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9952917098999023 - }, - { - "episode": 912, - "epoch": 0.0054642844303842975, - "loss/policy_avg": 0.10791392624378204, - "lr": 9.964212678936606e-06, - "objective/entropy": 32.22584533691406, - "objective/kl": 15.846414566040039, - "objective/non_score_reward": -0.7923207879066467, - "objective/rlhf_reward": 1.230716893076897, - "objective/scores": 1.1, - "policy/approxkl_avg": 256.9724426269531, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4736328125, - "step": 56, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984806776046753 - }, - { - "episode": 928, - "epoch": 0.005560149069513846, - "loss/policy_avg": 0.37206730246543884, - "lr": 9.963573619631903e-06, - "objective/entropy": -6.195688247680664, - "objective/kl": 12.801559448242188, - "objective/non_score_reward": -0.6400780081748962, - "objective/rlhf_reward": -2.5603120028972626, - "objective/scores": 0.0, - "policy/approxkl_avg": 40.83631896972656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.744140625, - "step": 57, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0007452964782715 - }, - { - "episode": 944, - "epoch": 0.005656013708643395, - "loss/policy_avg": 0.05091024935245514, - "lr": 9.9629345603272e-06, - "objective/entropy": -18.476280212402344, - "objective/kl": 18.95052719116211, - "objective/non_score_reward": -0.9475262761116028, - "objective/rlhf_reward": -3.790105164051056, - "objective/scores": 0.0, - "policy/approxkl_avg": 144.4001007080078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.580078125, - "step": 58, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988869428634644 - }, - { - "episode": 960, - "epoch": 0.005751878347772944, - "loss/policy_avg": -0.008312445133924484, - "lr": 9.962295501022495e-06, - "objective/entropy": 217.08169555664062, - "objective/kl": 14.908738136291504, - "objective/non_score_reward": -0.7454369068145752, - "objective/rlhf_reward": -2.9817477762699127, - "objective/scores": 0.0, - "policy/approxkl_avg": 93.395751953125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.77734375, - "step": 59, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9973504543304443 - }, - { - "episode": 976, - "epoch": 0.005847742986902493, - "loss/policy_avg": 0.03407389298081398, - "lr": 9.961656441717792e-06, - "objective/entropy": 140.58189392089844, - "objective/kl": 20.377920150756836, - "objective/non_score_reward": -1.0188961029052734, - "objective/rlhf_reward": -6.075584411621094, - "objective/scores": -0.5, - "policy/approxkl_avg": 66.34793090820312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.673828125, - "step": 60, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9996695518493652 - }, - { - "episode": 992, - "epoch": 0.005943607626032043, - "loss/policy_avg": 0.08645053207874298, - "lr": 9.961017382413088e-06, - "objective/entropy": 19.108230590820312, - "objective/kl": 13.644828796386719, - "objective/non_score_reward": -0.6822414994239807, - "objective/rlhf_reward": -2.7289658784866333, - "objective/scores": 0.0, - "policy/approxkl_avg": 35.71690368652344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.564453125, - "step": 61, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9962527751922607 - }, - { - "episode": 1008, - "epoch": 0.006039472265161592, - "loss/policy_avg": -0.17965860664844513, - "lr": 9.960378323108385e-06, - "objective/entropy": 168.96075439453125, - "objective/kl": 11.691057205200195, - "objective/non_score_reward": -0.5845528841018677, - "objective/rlhf_reward": -4.338212013244629, - "objective/scores": -0.5, - "policy/approxkl_avg": 65.8020248413086, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.775390625, - "step": 62, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0171313285827637 - }, - { - "episode": 1024, - "epoch": 0.006135336904291141, - "loss/policy_avg": 0.061459362506866455, - "lr": 9.959739263803682e-06, - "objective/entropy": 117.6607437133789, - "objective/kl": 15.35727310180664, - "objective/non_score_reward": -0.7678636312484741, - "objective/rlhf_reward": -3.071454644203186, - "objective/scores": 0.0, - "policy/approxkl_avg": 187.7376708984375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9375, - "step": 63, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990172386169434 - }, - { - "episode": 1040, - "epoch": 0.00623120154342069, - "loss/policy_avg": 0.07200516015291214, - "lr": 9.959100204498979e-06, - "objective/entropy": 20.476089477539062, - "objective/kl": 13.475000381469727, - "objective/non_score_reward": -0.6737500429153442, - "objective/rlhf_reward": 1.7049996197223667, - "objective/scores": 1.1, - "policy/approxkl_avg": 68.92333984375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6796875, - "step": 64, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985809326171875 - }, - { - "episode": 1056, - "epoch": 0.006327066182550239, - "loss/policy_avg": 0.08365275710821152, - "lr": 9.958461145194274e-06, - "objective/entropy": -127.62371826171875, - "objective/kl": 22.050678253173828, - "objective/non_score_reward": -1.1025339365005493, - "objective/rlhf_reward": -4.410135626792908, - "objective/scores": 0.0, - "policy/approxkl_avg": 301.7841491699219, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.720703125, - "step": 65, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990687370300293 - }, - { - "episode": 1072, - "epoch": 0.006422930821679788, - "loss/policy_avg": 0.040758199989795685, - "lr": 9.957822085889571e-06, - "objective/entropy": 70.77458190917969, - "objective/kl": 18.2130069732666, - "objective/non_score_reward": -0.9106502532958984, - "objective/rlhf_reward": -1.9807416550522907, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 190.14797973632812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.439453125, - "step": 66, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984623193740845 - }, - { - "episode": 1088, - "epoch": 0.006518795460809337, - "loss/policy_avg": 0.05295582860708237, - "lr": 9.957183026584868e-06, - "objective/entropy": 97.35667419433594, - "objective/kl": 24.64842987060547, - "objective/non_score_reward": -1.2324215173721313, - "objective/rlhf_reward": -4.929685860872269, - "objective/scores": 0.0, - "policy/approxkl_avg": 394.2406921386719, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.67578125, - "step": 67, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0007405281066895 - }, - { - "episode": 1104, - "epoch": 0.006614660099938887, - "loss/policy_avg": 0.14266067743301392, - "lr": 9.956543967280165e-06, - "objective/entropy": 85.57185363769531, - "objective/kl": 14.76464557647705, - "objective/non_score_reward": -0.7382322549819946, - "objective/rlhf_reward": -0.029210095049115647, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 171.19406127929688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015625, - "step": 68, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9964725971221924 - }, - { - "episode": 1120, - "epoch": 0.006710524739068436, - "loss/policy_avg": 0.11469551920890808, - "lr": 9.955904907975462e-06, - "objective/entropy": 21.974023818969727, - "objective/kl": 16.630640029907227, - "objective/non_score_reward": -0.8315319418907166, - "objective/rlhf_reward": -3.326127827167511, - "objective/scores": 0.0, - "policy/approxkl_avg": 63.985679626464844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.72265625, - "step": 69, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999800682067871 - }, - { - "episode": 1136, - "epoch": 0.006806389378197985, - "loss/policy_avg": 0.10287429392337799, - "lr": 9.955265848670757e-06, - "objective/entropy": 43.38239288330078, - "objective/kl": 22.02418327331543, - "objective/non_score_reward": -1.101209282875061, - "objective/rlhf_reward": -4.404837071895599, - "objective/scores": 0.0, - "policy/approxkl_avg": 135.89984130859375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.69140625, - "step": 70, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9973747730255127 - }, - { - "episode": 1152, - "epoch": 0.0069022540173275335, - "loss/policy_avg": 2.0731570720672607, - "lr": 9.954626789366054e-06, - "objective/entropy": -7.300925254821777, - "objective/kl": 17.635089874267578, - "objective/non_score_reward": -0.8817545175552368, - "objective/rlhf_reward": -3.5270181000232697, - "objective/scores": 0.0, - "policy/approxkl_avg": 183.6417236328125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.65625, - "step": 71, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9976426362991333 - }, - { - "episode": 1168, - "epoch": 0.0069981186564570825, - "loss/policy_avg": 0.24466943740844727, - "lr": 9.95398773006135e-06, - "objective/entropy": -26.054595947265625, - "objective/kl": 15.905699729919434, - "objective/non_score_reward": -0.7952849864959717, - "objective/rlhf_reward": -5.181139945983887, - "objective/scores": -0.5, - "policy/approxkl_avg": 46.05584716796875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 72, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9955050945281982 - }, - { - "episode": 1184, - "epoch": 0.0070939832955866314, - "loss/policy_avg": 0.4031391739845276, - "lr": 9.953348670756648e-06, - "objective/entropy": -107.55976867675781, - "objective/kl": 19.68102264404297, - "objective/non_score_reward": -0.9840512275695801, - "objective/rlhf_reward": -3.936204746365547, - "objective/scores": 0.0, - "policy/approxkl_avg": 232.6634521484375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.751953125, - "step": 73, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.00066876411438 - }, - { - "episode": 1200, - "epoch": 0.00718984793471618, - "loss/policy_avg": 0.1890699565410614, - "lr": 9.952709611451944e-06, - "objective/entropy": 118.900146484375, - "objective/kl": 21.680133819580078, - "objective/non_score_reward": -1.084006667137146, - "objective/rlhf_reward": -6.336027145385742, - "objective/scores": -0.5, - "policy/approxkl_avg": 266.20465087890625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.720703125, - "step": 74, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997557282447815 - }, - { - "episode": 1216, - "epoch": 0.00728571257384573, - "loss/policy_avg": 0.383888304233551, - "lr": 9.952070552147241e-06, - "objective/entropy": 124.33120727539062, - "objective/kl": 21.27002716064453, - "objective/non_score_reward": -1.0635013580322266, - "objective/rlhf_reward": -4.254005193710327, - "objective/scores": 0.0, - "policy/approxkl_avg": 86.44483184814453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.611328125, - "step": 75, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9962941408157349 - }, - { - "episode": 1232, - "epoch": 0.007381577212975279, - "loss/policy_avg": 0.23960661888122559, - "lr": 9.951431492842536e-06, - "objective/entropy": 40.334468841552734, - "objective/kl": 17.827497482299805, - "objective/non_score_reward": -0.891374945640564, - "objective/rlhf_reward": -3.5654996633529663, - "objective/scores": 0.0, - "policy/approxkl_avg": 94.15713500976562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.548828125, - "step": 76, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984164237976074 - }, - { - "episode": 1248, - "epoch": 0.007477441852104828, - "loss/policy_avg": 0.4706483781337738, - "lr": 9.950792433537833e-06, - "objective/entropy": 106.07322692871094, - "objective/kl": 19.12630844116211, - "objective/non_score_reward": -0.9563154578208923, - "objective/rlhf_reward": -3.8252618312835693, - "objective/scores": 0.0, - "policy/approxkl_avg": 185.7378387451172, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.87890625, - "step": 77, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9966304302215576 - }, - { - "episode": 1264, - "epoch": 0.007573306491234377, - "loss/policy_avg": 0.0665474385023117, - "lr": 9.950153374233129e-06, - "objective/entropy": 34.984527587890625, - "objective/kl": 23.865880966186523, - "objective/non_score_reward": -1.1932940483093262, - "objective/rlhf_reward": -3.373176074028015, - "objective/scores": 0.35, - "policy/approxkl_avg": 336.36712646484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53515625, - "step": 78, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997634768486023 - }, - { - "episode": 1280, - "epoch": 0.007669171130363926, - "loss/policy_avg": 0.3842596113681793, - "lr": 9.949514314928425e-06, - "objective/entropy": 229.88047790527344, - "objective/kl": 27.72378921508789, - "objective/non_score_reward": -1.386189579963684, - "objective/rlhf_reward": -1.144758558273315, - "objective/scores": 1.1, - "policy/approxkl_avg": 524.0328369140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.720703125, - "step": 79, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.997296690940857 - }, - { - "episode": 1296, - "epoch": 0.007765035769493475, - "loss/policy_avg": 0.4347228705883026, - "lr": 9.948875255623722e-06, - "objective/entropy": -43.351566314697266, - "objective/kl": 18.37939453125, - "objective/non_score_reward": -0.9189697504043579, - "objective/rlhf_reward": -3.6758789718151093, - "objective/scores": 0.0, - "policy/approxkl_avg": 87.53759002685547, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.521484375, - "step": 80, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9968631267547607 - }, - { - "episode": 1312, - "epoch": 0.007860900408623025, - "loss/policy_avg": 0.5703809261322021, - "lr": 9.94823619631902e-06, - "objective/entropy": 182.94879150390625, - "objective/kl": 24.6871337890625, - "objective/non_score_reward": -1.2343567609786987, - "objective/rlhf_reward": -6.937427043914795, - "objective/scores": -0.5, - "policy/approxkl_avg": 274.44744873046875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71875, - "step": 81, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995486736297607 - }, - { - "episode": 1328, - "epoch": 0.007956765047752574, - "loss/policy_avg": 0.10641711950302124, - "lr": 9.947597137014316e-06, - "objective/entropy": 73.29893493652344, - "objective/kl": 17.603548049926758, - "objective/non_score_reward": -0.88017737865448, - "objective/rlhf_reward": -3.52070951461792, - "objective/scores": 0.0, - "policy/approxkl_avg": 123.0771255493164, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.505859375, - "step": 82, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998863935470581 - }, - { - "episode": 1344, - "epoch": 0.008052629686882123, - "loss/policy_avg": 0.12928956747055054, - "lr": 9.946958077709611e-06, - "objective/entropy": 108.6548080444336, - "objective/kl": 18.59684944152832, - "objective/non_score_reward": -0.929842472076416, - "objective/rlhf_reward": -2.3407678390420497, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 120.68421936035156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 83, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998254656791687 - }, - { - "episode": 1360, - "epoch": 0.008148494326011672, - "loss/policy_avg": 0.14865761995315552, - "lr": 9.946319018404908e-06, - "objective/entropy": 102.67412567138672, - "objective/kl": 23.651020050048828, - "objective/non_score_reward": -1.1825510263442993, - "objective/rlhf_reward": -4.7302040457725525, - "objective/scores": 0.0, - "policy/approxkl_avg": 73.66981506347656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4033203125, - "step": 84, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000584602355957 - }, - { - "episode": 1376, - "epoch": 0.00824435896514122, - "loss/policy_avg": 0.07400541007518768, - "lr": 9.945679959100205e-06, - "objective/entropy": 133.18292236328125, - "objective/kl": 11.912694931030273, - "objective/non_score_reward": -0.5956346988677979, - "objective/rlhf_reward": -0.4351277453469593, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 49.34624099731445, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.59375, - "step": 85, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000187397003174 - }, - { - "episode": 1392, - "epoch": 0.00834022360427077, - "loss/policy_avg": 0.13357847929000854, - "lr": 9.945040899795502e-06, - "objective/entropy": 112.34770202636719, - "objective/kl": 20.725894927978516, - "objective/non_score_reward": -1.0362948179244995, - "objective/rlhf_reward": -2.7665772224343836, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 157.26473999023438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.56640625, - "step": 86, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9963226318359375 - }, - { - "episode": 1408, - "epoch": 0.008436088243400319, - "loss/policy_avg": 0.32753437757492065, - "lr": 9.944401840490799e-06, - "objective/entropy": 43.2598762512207, - "objective/kl": 19.98666000366211, - "objective/non_score_reward": -0.9993331432342529, - "objective/rlhf_reward": -5.997332572937012, - "objective/scores": -0.5, - "policy/approxkl_avg": 257.4547424316406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.599609375, - "step": 87, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9966068267822266 - }, - { - "episode": 1424, - "epoch": 0.008531952882529868, - "loss/policy_avg": 0.09795168787240982, - "lr": 9.943762781186096e-06, - "objective/entropy": -59.364646911621094, - "objective/kl": 14.953709602355957, - "objective/non_score_reward": -0.7476855516433716, - "objective/rlhf_reward": -4.990742206573486, - "objective/scores": -0.5, - "policy/approxkl_avg": 55.110633850097656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.640625, - "step": 88, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9969828128814697 - }, - { - "episode": 1440, - "epoch": 0.008627817521659416, - "loss/policy_avg": 0.06303240358829498, - "lr": 9.94312372188139e-06, - "objective/entropy": 50.4556770324707, - "objective/kl": 19.505146026611328, - "objective/non_score_reward": -0.9752573370933533, - "objective/rlhf_reward": -3.9010292887687683, - "objective/scores": 0.0, - "policy/approxkl_avg": 34.922752380371094, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.607421875, - "step": 89, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9964232444763184 - }, - { - "episode": 1456, - "epoch": 0.008723682160788965, - "loss/policy_avg": 0.2796894907951355, - "lr": 9.942484662576688e-06, - "objective/entropy": 135.44993591308594, - "objective/kl": 22.230022430419922, - "objective/non_score_reward": -1.1115009784698486, - "objective/rlhf_reward": -4.446004092693329, - "objective/scores": 0.0, - "policy/approxkl_avg": 48.1524658203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.56640625, - "step": 90, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998495101928711 - }, - { - "episode": 1472, - "epoch": 0.008819546799918514, - "loss/policy_avg": 0.12483496963977814, - "lr": 9.941845603271985e-06, - "objective/entropy": 148.17709350585938, - "objective/kl": 17.60011100769043, - "objective/non_score_reward": -0.8800055384635925, - "objective/rlhf_reward": -5.520022392272949, - "objective/scores": -0.5, - "policy/approxkl_avg": 142.25204467773438, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6171875, - "step": 91, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998748779296875 - }, - { - "episode": 1488, - "epoch": 0.008915411439048063, - "loss/policy_avg": 0.0684453696012497, - "lr": 9.941206543967281e-06, - "objective/entropy": 25.60771942138672, - "objective/kl": 17.530319213867188, - "objective/non_score_reward": -0.8765159845352173, - "objective/rlhf_reward": -5.506063938140869, - "objective/scores": -0.5, - "policy/approxkl_avg": 70.80863189697266, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.625, - "step": 92, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999198913574219 - }, - { - "episode": 1504, - "epoch": 0.009011276078177614, - "loss/policy_avg": 0.13488999009132385, - "lr": 9.940567484662578e-06, - "objective/entropy": -75.2538070678711, - "objective/kl": 19.696504592895508, - "objective/non_score_reward": -0.9848252534866333, - "objective/rlhf_reward": -3.939300984144211, - "objective/scores": 0.0, - "policy/approxkl_avg": 214.8182373046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.515625, - "step": 93, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989702701568604 - }, - { - "episode": 1520, - "epoch": 0.009107140717307163, - "loss/policy_avg": 0.02409663423895836, - "lr": 9.939928425357874e-06, - "objective/entropy": 8.831840515136719, - "objective/kl": 25.456069946289062, - "objective/non_score_reward": -1.272803544998169, - "objective/rlhf_reward": -3.5754421589695777, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 176.86953735351562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4873046875, - "step": 94, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9964553117752075 - }, - { - "episode": 1536, - "epoch": 0.009203005356436712, - "loss/policy_avg": 0.0426328219473362, - "lr": 9.93928936605317e-06, - "objective/entropy": 185.92372131347656, - "objective/kl": 19.176239013671875, - "objective/non_score_reward": -0.95881187915802, - "objective/rlhf_reward": -3.8352474570274353, - "objective/scores": 0.0, - "policy/approxkl_avg": 269.95849609375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 95, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9930847883224487 - }, - { - "episode": 1552, - "epoch": 0.009298869995566261, - "loss/policy_avg": 0.3135666251182556, - "lr": 9.938650306748467e-06, - "objective/entropy": -119.88722229003906, - "objective/kl": 18.911632537841797, - "objective/non_score_reward": -0.9455816745758057, - "objective/rlhf_reward": -3.782326579093933, - "objective/scores": 0.0, - "policy/approxkl_avg": 136.56689453125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.630859375, - "step": 96, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0003180503845215 - }, - { - "episode": 1568, - "epoch": 0.00939473463469581, - "loss/policy_avg": 0.1893162876367569, - "lr": 9.938011247443764e-06, - "objective/entropy": 179.721435546875, - "objective/kl": 21.26153564453125, - "objective/non_score_reward": -1.0630767345428467, - "objective/rlhf_reward": -1.3285883411180701, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 203.61773681640625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.70703125, - "step": 97, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.994171142578125 - }, - { - "episode": 1584, - "epoch": 0.009490599273825359, - "loss/policy_avg": 0.5632504224777222, - "lr": 9.937372188139061e-06, - "objective/entropy": 3.3514366149902344, - "objective/kl": 19.21142578125, - "objective/non_score_reward": -0.9605712890625, - "objective/rlhf_reward": -2.483035289977474, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 74.39619445800781, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6953125, - "step": 98, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99905526638031 - }, - { - "episode": 1600, - "epoch": 0.009586463912954908, - "loss/policy_avg": 0.20837292075157166, - "lr": 9.936733128834358e-06, - "objective/entropy": 121.03665161132812, - "objective/kl": 13.999438285827637, - "objective/non_score_reward": -0.6999719142913818, - "objective/rlhf_reward": -4.799887657165527, - "objective/scores": -0.5, - "policy/approxkl_avg": 32.508689880371094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.375, - "step": 99, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9994678497314453 - }, - { - "episode": 1616, - "epoch": 0.009682328552084457, - "loss/policy_avg": 0.2726283669471741, - "lr": 9.936094069529653e-06, - "objective/entropy": 110.09475708007812, - "objective/kl": 15.960447311401367, - "objective/non_score_reward": -0.798022449016571, - "objective/rlhf_reward": -3.192089796066284, - "objective/scores": 0.0, - "policy/approxkl_avg": 61.767425537109375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.55859375, - "step": 100, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0027127265930176 - }, - { - "episode": 1632, - "epoch": 0.009778193191214006, - "loss/policy_avg": 0.2845292091369629, - "lr": 9.93545501022495e-06, - "objective/entropy": -153.4110107421875, - "objective/kl": 16.05643081665039, - "objective/non_score_reward": -0.8028215765953064, - "objective/rlhf_reward": 1.1887137234210972, - "objective/scores": 1.1, - "policy/approxkl_avg": 95.32630157470703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.669921875, - "step": 101, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997380256652832 - }, - { - "episode": 1648, - "epoch": 0.009874057830343555, - "loss/policy_avg": 0.18602727353572845, - "lr": 9.934815950920245e-06, - "objective/entropy": -13.683324813842773, - "objective/kl": 23.494054794311523, - "objective/non_score_reward": -1.174702763557434, - "objective/rlhf_reward": -6.698811054229736, - "objective/scores": -0.5, - "policy/approxkl_avg": 145.7095947265625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4912109375, - "step": 102, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994981288909912 - }, - { - "episode": 1664, - "epoch": 0.009969922469473104, - "loss/policy_avg": 0.2709546983242035, - "lr": 9.934176891615542e-06, - "objective/entropy": -101.46907043457031, - "objective/kl": 22.274028778076172, - "objective/non_score_reward": -1.113701581954956, - "objective/rlhf_reward": -2.507394979672368, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 308.6561584472656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4912109375, - "step": 103, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981049299240112 - }, - { - "episode": 1680, - "epoch": 0.010065787108602653, - "loss/policy_avg": 0.0334465391933918, - "lr": 9.933537832310839e-06, - "objective/entropy": 130.1453857421875, - "objective/kl": 29.715213775634766, - "objective/non_score_reward": -1.4857605695724487, - "objective/rlhf_reward": -4.601406863241821, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 351.42138671875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5234375, - "step": 104, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998427152633667 - }, - { - "episode": 1696, - "epoch": 0.010161651747732202, - "loss/policy_avg": 0.24828088283538818, - "lr": 9.932898773006136e-06, - "objective/entropy": 125.98516082763672, - "objective/kl": 15.857706069946289, - "objective/non_score_reward": -0.7928853034973145, - "objective/rlhf_reward": -1.720943163247451, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 74.20083618164062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.578125, - "step": 105, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993948936462402 - }, - { - "episode": 1712, - "epoch": 0.01025751638686175, - "loss/policy_avg": 0.2954871356487274, - "lr": 9.932259713701433e-06, - "objective/entropy": 97.68868255615234, - "objective/kl": 12.135580062866211, - "objective/non_score_reward": -0.6067790389060974, - "objective/rlhf_reward": -2.4271161258220673, - "objective/scores": 0.0, - "policy/approxkl_avg": 67.66595458984375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59375, - "step": 106, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9992032051086426 - }, - { - "episode": 1728, - "epoch": 0.0103533810259913, - "loss/policy_avg": 0.10418711602687836, - "lr": 9.931620654396728e-06, - "objective/entropy": -43.816890716552734, - "objective/kl": 19.110689163208008, - "objective/non_score_reward": -0.9555345773696899, - "objective/rlhf_reward": -5.82213830947876, - "objective/scores": -0.5, - "policy/approxkl_avg": 160.15283203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6796875, - "step": 107, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9975645542144775 - }, - { - "episode": 1744, - "epoch": 0.01044924566512085, - "loss/policy_avg": 0.23229390382766724, - "lr": 9.930981595092025e-06, - "objective/entropy": 91.57461547851562, - "objective/kl": 18.9378662109375, - "objective/non_score_reward": -0.9468932747840881, - "objective/rlhf_reward": -3.787573218345642, - "objective/scores": 0.0, - "policy/approxkl_avg": 155.35989379882812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.66796875, - "step": 108, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9963436126708984 - }, - { - "episode": 1760, - "epoch": 0.010545110304250399, - "loss/policy_avg": 0.3382238447666168, - "lr": 9.930342535787322e-06, - "objective/entropy": -49.52970886230469, - "objective/kl": 17.919204711914062, - "objective/non_score_reward": -0.89596027135849, - "objective/rlhf_reward": -5.583841323852539, - "objective/scores": -0.5, - "policy/approxkl_avg": 151.97140502929688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73828125, - "step": 109, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9960722923278809 - }, - { - "episode": 1776, - "epoch": 0.010640974943379948, - "loss/policy_avg": 0.16102033853530884, - "lr": 9.929703476482619e-06, - "objective/entropy": -40.16828155517578, - "objective/kl": 15.826179504394531, - "objective/non_score_reward": -0.7913089990615845, - "objective/rlhf_reward": -3.1652360260486603, - "objective/scores": 0.0, - "policy/approxkl_avg": 32.21597671508789, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.767578125, - "step": 110, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9961647987365723 - }, - { - "episode": 1792, - "epoch": 0.010736839582509497, - "loss/policy_avg": 0.08855805546045303, - "lr": 9.929064417177915e-06, - "objective/entropy": 187.74282836914062, - "objective/kl": 22.12034797668457, - "objective/non_score_reward": -1.1060173511505127, - "objective/rlhf_reward": -6.424069404602051, - "objective/scores": -0.5, - "policy/approxkl_avg": 89.00162506103516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.80859375, - "step": 111, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9971915483474731 - }, - { - "episode": 1808, - "epoch": 0.010832704221639046, - "loss/policy_avg": 0.3315132260322571, - "lr": 9.928425357873212e-06, - "objective/entropy": -130.41551208496094, - "objective/kl": 20.600021362304688, - "objective/non_score_reward": -1.030001163482666, - "objective/rlhf_reward": -2.7414021278298915, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 231.54774475097656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.671875, - "step": 112, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0009350776672363 - }, - { - "episode": 1824, - "epoch": 0.010928568860768595, - "loss/policy_avg": 0.5379164814949036, - "lr": 9.927786298568507e-06, - "objective/entropy": 122.76021575927734, - "objective/kl": 24.667219161987305, - "objective/non_score_reward": -1.2333608865737915, - "objective/rlhf_reward": -6.933443546295166, - "objective/scores": -0.5, - "policy/approxkl_avg": 214.82647705078125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4638671875, - "step": 113, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9977924823760986 - }, - { - "episode": 1840, - "epoch": 0.011024433499898144, - "loss/policy_avg": 0.19039300084114075, - "lr": 9.927147239263804e-06, - "objective/entropy": -26.283668518066406, - "objective/kl": 21.043611526489258, - "objective/non_score_reward": -1.0521806478500366, - "objective/rlhf_reward": -2.6929507491909828, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 337.57025146484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 114, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9954912662506104 - }, - { - "episode": 1856, - "epoch": 0.011120298139027693, - "loss/policy_avg": 0.030586296692490578, - "lr": 9.926508179959101e-06, - "objective/entropy": 189.2314910888672, - "objective/kl": 18.47957992553711, - "objective/non_score_reward": -0.9239791035652161, - "objective/rlhf_reward": -3.6959164142608643, - "objective/scores": 0.0, - "policy/approxkl_avg": 158.6993865966797, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8671875, - "step": 115, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987783432006836 - }, - { - "episode": 1872, - "epoch": 0.011216162778157242, - "loss/policy_avg": 0.23665881156921387, - "lr": 9.925869120654398e-06, - "objective/entropy": 73.73204803466797, - "objective/kl": 19.052127838134766, - "objective/non_score_reward": -0.9526063203811646, - "objective/rlhf_reward": -5.810425281524658, - "objective/scores": -0.5, - "policy/approxkl_avg": 45.58797073364258, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.55859375, - "step": 116, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978525638580322 - }, - { - "episode": 1888, - "epoch": 0.01131202741728679, - "loss/policy_avg": 0.11659398674964905, - "lr": 9.925230061349695e-06, - "objective/entropy": 128.39474487304688, - "objective/kl": 25.3045597076416, - "objective/non_score_reward": -1.265228033065796, - "objective/rlhf_reward": -2.6609121322631832, - "objective/scores": 0.6, - "policy/approxkl_avg": 76.14613342285156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.64453125, - "step": 117, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9966604709625244 - }, - { - "episode": 1904, - "epoch": 0.01140789205641634, - "loss/policy_avg": 0.19203245639801025, - "lr": 9.92459100204499e-06, - "objective/entropy": 57.626686096191406, - "objective/kl": 30.407909393310547, - "objective/non_score_reward": -1.5203955173492432, - "objective/rlhf_reward": -1.6815817117691036, - "objective/scores": 1.1, - "policy/approxkl_avg": 324.47161865234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.544921875, - "step": 118, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0000627040863037 - }, - { - "episode": 1920, - "epoch": 0.011503756695545889, - "loss/policy_avg": -0.02956828847527504, - "lr": 9.923951942740287e-06, - "objective/entropy": 160.20449829101562, - "objective/kl": 13.33430290222168, - "objective/non_score_reward": -0.666715145111084, - "objective/rlhf_reward": -2.666860580444336, - "objective/scores": 0.0, - "policy/approxkl_avg": 62.17939758300781, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587890625, - "step": 119, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.003843307495117 - }, - { - "episode": 1936, - "epoch": 0.011599621334675438, - "loss/policy_avg": 0.3666956424713135, - "lr": 9.923312883435584e-06, - "objective/entropy": 173.73385620117188, - "objective/kl": 25.82461929321289, - "objective/non_score_reward": -1.2912311553955078, - "objective/rlhf_reward": -7.164924621582031, - "objective/scores": -0.5, - "policy/approxkl_avg": 248.4417724609375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.71484375, - "step": 120, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0001635551452637 - }, - { - "episode": 1952, - "epoch": 0.011695485973804987, - "loss/policy_avg": 0.07095953077077866, - "lr": 9.92267382413088e-06, - "objective/entropy": 60.89289855957031, - "objective/kl": 21.512653350830078, - "objective/non_score_reward": -1.0756325721740723, - "objective/rlhf_reward": -2.851932506175384, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 187.26104736328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.654296875, - "step": 121, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9986932277679443 - }, - { - "episode": 1968, - "epoch": 0.011791350612934537, - "loss/policy_avg": 0.11872611939907074, - "lr": 9.922034764826178e-06, - "objective/entropy": -24.511760711669922, - "objective/kl": 22.253305435180664, - "objective/non_score_reward": -1.1126651763916016, - "objective/rlhf_reward": -4.450661063194275, - "objective/scores": 0.0, - "policy/approxkl_avg": 199.84397888183594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 122, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9950015544891357 - }, - { - "episode": 1984, - "epoch": 0.011887215252064086, - "loss/policy_avg": 0.5726426839828491, - "lr": 9.921395705521473e-06, - "objective/entropy": 102.35612487792969, - "objective/kl": 32.768287658691406, - "objective/non_score_reward": -1.6384142637252808, - "objective/rlhf_reward": -8.553656578063965, - "objective/scores": -0.5, - "policy/approxkl_avg": 327.3544921875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.533203125, - "step": 123, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.995248556137085 - }, - { - "episode": 2000, - "epoch": 0.011983079891193635, - "loss/policy_avg": 0.19069992005825043, - "lr": 9.92075664621677e-06, - "objective/entropy": 7.145952224731445, - "objective/kl": 17.727392196655273, - "objective/non_score_reward": -0.8863697052001953, - "objective/rlhf_reward": -3.545478705316782, - "objective/scores": 0.0, - "policy/approxkl_avg": 26.600868225097656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 124, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0011134147644043 - }, - { - "episode": 2016, - "epoch": 0.012078944530323184, - "loss/policy_avg": 0.2572447657585144, - "lr": 9.920117586912067e-06, - "objective/entropy": 109.04229736328125, - "objective/kl": 17.79098129272461, - "objective/non_score_reward": -0.8895490765571594, - "objective/rlhf_reward": -5.558196067810059, - "objective/scores": -0.5, - "policy/approxkl_avg": 111.66732788085938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.763671875, - "step": 125, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9968197345733643 - }, - { - "episode": 2032, - "epoch": 0.012174809169452733, - "loss/policy_avg": 0.043444547802209854, - "lr": 9.919478527607362e-06, - "objective/entropy": 75.83810424804688, - "objective/kl": 27.20602798461914, - "objective/non_score_reward": -1.3603014945983887, - "objective/rlhf_reward": -2.51748690450308, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 274.793701171875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.521484375, - "step": 126, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9991501569747925 - }, - { - "episode": 2048, - "epoch": 0.012270673808582282, - "loss/policy_avg": 0.2138219177722931, - "lr": 9.918839468302659e-06, - "objective/entropy": 21.247840881347656, - "objective/kl": 14.299978256225586, - "objective/non_score_reward": -0.7149989008903503, - "objective/rlhf_reward": -2.8599955439567566, - "objective/scores": 0.0, - "policy/approxkl_avg": 21.416780471801758, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.80078125, - "step": 127, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998870849609375 - }, - { - "episode": 2064, - "epoch": 0.012366538447711831, - "loss/policy_avg": 0.23010344803333282, - "lr": 9.918200408997956e-06, - "objective/entropy": -76.91316223144531, - "objective/kl": 13.382017135620117, - "objective/non_score_reward": -0.6691007614135742, - "objective/rlhf_reward": -2.676403224468231, - "objective/scores": 0.0, - "policy/approxkl_avg": 37.060523986816406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.677734375, - "step": 128, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998915195465088 - }, - { - "episode": 2080, - "epoch": 0.01246240308684138, - "loss/policy_avg": 0.4017820954322815, - "lr": 9.917561349693252e-06, - "objective/entropy": 198.82456970214844, - "objective/kl": 22.337753295898438, - "objective/non_score_reward": -1.1168878078460693, - "objective/rlhf_reward": -2.9112917771011144, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 41.49570846557617, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.59765625, - "step": 129, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.001622200012207 - }, - { - "episode": 2096, - "epoch": 0.012558267725970929, - "loss/policy_avg": 0.45664405822753906, - "lr": 9.91692229038855e-06, - "objective/entropy": 96.67610168457031, - "objective/kl": 13.830822944641113, - "objective/non_score_reward": -0.6915411353111267, - "objective/rlhf_reward": -2.7661644518375397, - "objective/scores": 0.0, - "policy/approxkl_avg": 49.5977783203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.87890625, - "step": 130, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0018720626831055 - }, - { - "episode": 2112, - "epoch": 0.012654132365100478, - "loss/policy_avg": 0.18199189007282257, - "lr": 9.916283231083844e-06, - "objective/entropy": 12.164558410644531, - "objective/kl": 17.693878173828125, - "objective/non_score_reward": -0.8846939206123352, - "objective/rlhf_reward": -3.538775682449341, - "objective/scores": 0.0, - "policy/approxkl_avg": 3.7435173988342285, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.62890625, - "step": 131, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0015268325805664 - }, - { - "episode": 2128, - "epoch": 0.012749997004230027, - "loss/policy_avg": 0.21469825506210327, - "lr": 9.915644171779141e-06, - "objective/entropy": -11.569038391113281, - "objective/kl": 14.204147338867188, - "objective/non_score_reward": -0.7102073431015015, - "objective/rlhf_reward": -1.481579744552059, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 91.71839141845703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 132, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0028629302978516 - }, - { - "episode": 2144, - "epoch": 0.012845861643359576, - "loss/policy_avg": 0.27063143253326416, - "lr": 9.915005112474438e-06, - "objective/entropy": 180.4578857421875, - "objective/kl": 24.935741424560547, - "objective/non_score_reward": -1.2467870712280273, - "objective/rlhf_reward": -6.987148284912109, - "objective/scores": -0.5, - "policy/approxkl_avg": 138.678955078125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.529296875, - "step": 133, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9967637062072754 - }, - { - "episode": 2160, - "epoch": 0.012941726282489125, - "loss/policy_avg": 0.1394023448228836, - "lr": 9.914366053169735e-06, - "objective/entropy": -29.98552703857422, - "objective/kl": 13.385698318481445, - "objective/non_score_reward": -0.6692849397659302, - "objective/rlhf_reward": -2.6771397292613983, - "objective/scores": 0.0, - "policy/approxkl_avg": 73.47354125976562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6171875, - "step": 134, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000580310821533 - }, - { - "episode": 2176, - "epoch": 0.013037590921618674, - "loss/policy_avg": 0.0048561920411884785, - "lr": 9.913726993865032e-06, - "objective/entropy": 88.89292907714844, - "objective/kl": 28.03160858154297, - "objective/non_score_reward": -1.4015804529190063, - "objective/rlhf_reward": -4.227719643203121, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 370.16766357421875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.666015625, - "step": 135, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.995047926902771 - }, - { - "episode": 2192, - "epoch": 0.013133455560748224, - "loss/policy_avg": 0.1565648913383484, - "lr": 9.913087934560329e-06, - "objective/entropy": 48.28108596801758, - "objective/kl": 22.514755249023438, - "objective/non_score_reward": -1.1257379055023193, - "objective/rlhf_reward": -0.10295168161392176, - "objective/scores": 1.1, - "policy/approxkl_avg": 272.63470458984375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.58203125, - "step": 136, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0003504753112793 - }, - { - "episode": 2208, - "epoch": 0.013229320199877773, - "loss/policy_avg": 0.1350177526473999, - "lr": 9.912448875255624e-06, - "objective/entropy": 184.51797485351562, - "objective/kl": 30.795909881591797, - "objective/non_score_reward": -1.5397955179214478, - "objective/rlhf_reward": -8.159181594848633, - "objective/scores": -0.5, - "policy/approxkl_avg": 407.7762145996094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.578125, - "step": 137, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.997571349143982 - }, - { - "episode": 2224, - "epoch": 0.013325184839007322, - "loss/policy_avg": 0.2587956190109253, - "lr": 9.911809815950921e-06, - "objective/entropy": 14.785064697265625, - "objective/kl": 23.858671188354492, - "objective/non_score_reward": -1.1929335594177246, - "objective/rlhf_reward": -4.77173438668251, - "objective/scores": 0.0, - "policy/approxkl_avg": 258.4976501464844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58984375, - "step": 138, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.996340036392212 - }, - { - "episode": 2240, - "epoch": 0.013421049478136871, - "loss/policy_avg": 0.03932709991931915, - "lr": 9.911170756646218e-06, - "objective/entropy": -88.26953887939453, - "objective/kl": 11.428003311157227, - "objective/non_score_reward": -0.5714001655578613, - "objective/rlhf_reward": -4.285600662231445, - "objective/scores": -0.5, - "policy/approxkl_avg": 41.424224853515625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.775390625, - "step": 139, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0010673999786377 - }, - { - "episode": 2256, - "epoch": 0.01351691411726642, - "loss/policy_avg": 0.20215287804603577, - "lr": 9.910531697341515e-06, - "objective/entropy": 66.65933227539062, - "objective/kl": 20.929710388183594, - "objective/non_score_reward": -1.046485424041748, - "objective/rlhf_reward": 0.2140582442283634, - "objective/scores": 1.1, - "policy/approxkl_avg": 124.57344055175781, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7421875, - "step": 140, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977262020111084 - }, - { - "episode": 2272, - "epoch": 0.01361277875639597, - "loss/policy_avg": 0.5699018836021423, - "lr": 9.909892638036812e-06, - "objective/entropy": 10.020034790039062, - "objective/kl": 16.74536895751953, - "objective/non_score_reward": -0.8372684717178345, - "objective/rlhf_reward": 1.0509260237216953, - "objective/scores": 1.1, - "policy/approxkl_avg": 193.53329467773438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.640625, - "step": 141, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986698627471924 - }, - { - "episode": 2288, - "epoch": 0.013708643395525518, - "loss/policy_avg": 0.15781471133232117, - "lr": 9.909253578732107e-06, - "objective/entropy": -44.31187438964844, - "objective/kl": 30.742799758911133, - "objective/non_score_reward": -1.537139892578125, - "objective/rlhf_reward": -8.1485595703125, - "objective/scores": -0.5, - "policy/approxkl_avg": 158.5760498046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.73828125, - "step": 142, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9987871646881104 - }, - { - "episode": 2304, - "epoch": 0.013804508034655067, - "loss/policy_avg": 0.09526471048593521, - "lr": 9.908614519427404e-06, - "objective/entropy": 52.471221923828125, - "objective/kl": 19.550655364990234, - "objective/non_score_reward": -0.9775327444076538, - "objective/rlhf_reward": -2.4595327778771967, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 176.07566833496094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.91796875, - "step": 143, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9970924854278564 - }, - { - "episode": 2320, - "epoch": 0.013900372673784616, - "loss/policy_avg": 0.03243420645594597, - "lr": 9.9079754601227e-06, - "objective/entropy": 128.66928100585938, - "objective/kl": 21.24932861328125, - "objective/non_score_reward": -1.0624663829803467, - "objective/rlhf_reward": -1.849865472316742, - "objective/scores": 0.6, - "policy/approxkl_avg": 188.10623168945312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.470703125, - "step": 144, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9996135234832764 - }, - { - "episode": 2336, - "epoch": 0.013996237312914165, - "loss/policy_avg": 0.25250673294067383, - "lr": 9.907336400817996e-06, - "objective/entropy": 198.611083984375, - "objective/kl": 21.650169372558594, - "objective/non_score_reward": -1.0825085639953613, - "objective/rlhf_reward": -6.330034255981445, - "objective/scores": -0.5, - "policy/approxkl_avg": 130.0052490234375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.892578125, - "step": 145, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994676113128662 - }, - { - "episode": 2352, - "epoch": 0.014092101952043714, - "loss/policy_avg": -0.034668684005737305, - "lr": 9.906697341513293e-06, - "objective/entropy": -27.681907653808594, - "objective/kl": 26.334529876708984, - "objective/non_score_reward": -1.316726565361023, - "objective/rlhf_reward": -3.907656395171566, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 210.81857299804688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.552734375, - "step": 146, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9972717761993408 - }, - { - "episode": 2368, - "epoch": 0.014187966591173263, - "loss/policy_avg": 0.12787118554115295, - "lr": 9.90605828220859e-06, - "objective/entropy": -43.486568450927734, - "objective/kl": 20.714540481567383, - "objective/non_score_reward": -1.0357270240783691, - "objective/rlhf_reward": -4.142907917499542, - "objective/scores": 0.0, - "policy/approxkl_avg": 86.59358978271484, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.673828125, - "step": 147, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999436378479004 - }, - { - "episode": 2384, - "epoch": 0.014283831230302812, - "loss/policy_avg": 0.03302329033613205, - "lr": 9.905419222903886e-06, - "objective/entropy": 173.50836181640625, - "objective/kl": 23.32859230041504, - "objective/non_score_reward": -1.1664297580718994, - "objective/rlhf_reward": -3.3402058220206925, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 224.7312774658203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.759765625, - "step": 148, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9986599683761597 - }, - { - "episode": 2400, - "epoch": 0.01437969586943236, - "loss/policy_avg": 0.04894339293241501, - "lr": 9.904780163599183e-06, - "objective/entropy": 48.957122802734375, - "objective/kl": 25.618064880371094, - "objective/non_score_reward": -1.2809032201766968, - "objective/rlhf_reward": -3.699780781467525, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 230.8973388671875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.576171875, - "step": 149, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9963643550872803 - }, - { - "episode": 2416, - "epoch": 0.01447556050856191, - "loss/policy_avg": 0.36329030990600586, - "lr": 9.904141104294478e-06, - "objective/entropy": 157.50445556640625, - "objective/kl": 23.93838882446289, - "objective/non_score_reward": -1.1969194412231445, - "objective/rlhf_reward": -4.787678003311157, - "objective/scores": 0.0, - "policy/approxkl_avg": 200.30621337890625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 150, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9953033924102783 - }, - { - "episode": 2432, - "epoch": 0.01457142514769146, - "loss/policy_avg": 0.3649589419364929, - "lr": 9.903502044989775e-06, - "objective/entropy": 213.43943786621094, - "objective/kl": 19.777463912963867, - "objective/non_score_reward": -0.9888731837272644, - "objective/rlhf_reward": -2.6138571410471494, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 81.07418060302734, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.677734375, - "step": 151, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0006325244903564 - }, - { - "episode": 2448, - "epoch": 0.01466728978682101, - "loss/policy_avg": 0.35868164896965027, - "lr": 9.902862985685072e-06, - "objective/entropy": 2.8281936645507812, - "objective/kl": 23.836688995361328, - "objective/non_score_reward": -1.191834568977356, - "objective/rlhf_reward": -6.767337799072266, - "objective/scores": -0.5, - "policy/approxkl_avg": 91.45345306396484, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5703125, - "step": 152, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9970684051513672 - }, - { - "episode": 2464, - "epoch": 0.014763154425950558, - "loss/policy_avg": 0.07516692578792572, - "lr": 9.902223926380369e-06, - "objective/entropy": 171.71034240722656, - "objective/kl": 21.95725440979004, - "objective/non_score_reward": -1.097862720489502, - "objective/rlhf_reward": -4.3914510905742645, - "objective/scores": 0.0, - "policy/approxkl_avg": 221.61212158203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53515625, - "step": 153, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9973832368850708 - }, - { - "episode": 2480, - "epoch": 0.014859019065080107, - "loss/policy_avg": 0.01844581961631775, - "lr": 9.901584867075666e-06, - "objective/entropy": 64.6440200805664, - "objective/kl": 17.436233520507812, - "objective/non_score_reward": -0.8718117475509644, - "objective/rlhf_reward": -5.487246990203857, - "objective/scores": -0.5, - "policy/approxkl_avg": 70.52445983886719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.755859375, - "step": 154, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000408172607422 - }, - { - "episode": 2496, - "epoch": 0.014954883704209656, - "loss/policy_avg": 0.05899505689740181, - "lr": 9.900945807770961e-06, - "objective/entropy": 123.7980728149414, - "objective/kl": 25.07213592529297, - "objective/non_score_reward": -1.253606915473938, - "objective/rlhf_reward": -7.014427661895752, - "objective/scores": -0.5, - "policy/approxkl_avg": 88.28120422363281, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.51171875, - "step": 155, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000736951828003 - }, - { - "episode": 2512, - "epoch": 0.015050748343339205, - "loss/policy_avg": 0.4545804560184479, - "lr": 9.900306748466258e-06, - "objective/entropy": 10.871131896972656, - "objective/kl": 26.028505325317383, - "objective/non_score_reward": -1.3014252185821533, - "objective/rlhf_reward": -5.205701023340225, - "objective/scores": 0.0, - "policy/approxkl_avg": 217.84939575195312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5859375, - "step": 156, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9983347654342651 - }, - { - "episode": 2528, - "epoch": 0.015146612982468754, - "loss/policy_avg": 0.4191577136516571, - "lr": 9.899667689161555e-06, - "objective/entropy": 109.52301025390625, - "objective/kl": 27.344154357910156, - "objective/non_score_reward": -1.3672077655792236, - "objective/rlhf_reward": -7.4688310623168945, - "objective/scores": -0.5, - "policy/approxkl_avg": 89.15927124023438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62109375, - "step": 157, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999230146408081 - }, - { - "episode": 2544, - "epoch": 0.015242477621598303, - "loss/policy_avg": 0.1439390629529953, - "lr": 9.899028629856852e-06, - "objective/entropy": 220.28952026367188, - "objective/kl": 19.178768157958984, - "objective/non_score_reward": -0.9589384198188782, - "objective/rlhf_reward": -3.8357537388801575, - "objective/scores": 0.0, - "policy/approxkl_avg": 71.46617126464844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8828125, - "step": 158, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997718334197998 - }, - { - "episode": 2560, - "epoch": 0.015338342260727852, - "loss/policy_avg": 0.30983591079711914, - "lr": 9.898389570552149e-06, - "objective/entropy": 104.64752197265625, - "objective/kl": 27.657455444335938, - "objective/non_score_reward": -1.3828728199005127, - "objective/rlhf_reward": -3.7981575886408487, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 138.6593017578125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.560546875, - "step": 159, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977858066558838 - }, - { - "episode": 2576, - "epoch": 0.015434206899857401, - "loss/policy_avg": 0.07094208896160126, - "lr": 9.897750511247446e-06, - "objective/entropy": 90.89056396484375, - "objective/kl": 21.30394744873047, - "objective/non_score_reward": -1.065197467803955, - "objective/rlhf_reward": -4.260790050029755, - "objective/scores": 0.0, - "policy/approxkl_avg": 297.8140563964844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.80859375, - "step": 160, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0023467540740967 - }, - { - "episode": 2592, - "epoch": 0.01553007153898695, - "loss/policy_avg": 0.16327880322933197, - "lr": 9.89711145194274e-06, - "objective/entropy": 100.65301513671875, - "objective/kl": 24.78557586669922, - "objective/non_score_reward": -1.239278793334961, - "objective/rlhf_reward": -3.1322867229309788, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 208.9399871826172, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.625, - "step": 161, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0041985511779785 - }, - { - "episode": 2608, - "epoch": 0.0156259361781165, - "loss/policy_avg": 0.216099351644516, - "lr": 9.896472392638038e-06, - "objective/entropy": 57.33685302734375, - "objective/kl": 29.304649353027344, - "objective/non_score_reward": -1.4652326107025146, - "objective/rlhf_reward": -7.8609299659729, - "objective/scores": -0.5, - "policy/approxkl_avg": 207.6790313720703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.833984375, - "step": 162, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9980382919311523 - }, - { - "episode": 2624, - "epoch": 0.01572180081724605, - "loss/policy_avg": 0.0953613817691803, - "lr": 9.895833333333334e-06, - "objective/entropy": -133.8697967529297, - "objective/kl": 16.73604965209961, - "objective/non_score_reward": -0.83680260181427, - "objective/rlhf_reward": -3.347210466861725, - "objective/scores": 0.0, - "policy/approxkl_avg": 145.06759643554688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.802734375, - "step": 163, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9988582134246826 - }, - { - "episode": 2640, - "epoch": 0.0158176654563756, - "loss/policy_avg": 0.42890581488609314, - "lr": 9.895194274028631e-06, - "objective/entropy": 180.38433837890625, - "objective/kl": 25.974315643310547, - "objective/non_score_reward": -1.2987157106399536, - "objective/rlhf_reward": -7.1948628425598145, - "objective/scores": -0.5, - "policy/approxkl_avg": 126.24812316894531, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73046875, - "step": 164, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9987969398498535 - }, - { - "episode": 2656, - "epoch": 0.015913530095505148, - "loss/policy_avg": 0.1958284080028534, - "lr": 9.894555214723928e-06, - "objective/entropy": 174.94032287597656, - "objective/kl": 25.64311981201172, - "objective/non_score_reward": -1.282155990600586, - "objective/rlhf_reward": -3.786988249331146, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 86.50934600830078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.705078125, - "step": 165, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9944665431976318 - }, - { - "episode": 2672, - "epoch": 0.016009394734634697, - "loss/policy_avg": 0.3368389904499054, - "lr": 9.893916155419225e-06, - "objective/entropy": 10.10284423828125, - "objective/kl": 24.560279846191406, - "objective/non_score_reward": -1.2280139923095703, - "objective/rlhf_reward": -6.912055969238281, - "objective/scores": -0.5, - "policy/approxkl_avg": 80.86394500732422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.720703125, - "step": 166, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986299276351929 - }, - { - "episode": 2688, - "epoch": 0.016105259373764245, - "loss/policy_avg": 0.11198948323726654, - "lr": 9.89327709611452e-06, - "objective/entropy": 161.62661743164062, - "objective/kl": 39.22645568847656, - "objective/non_score_reward": -1.9613227844238281, - "objective/rlhf_reward": -7.845290899276733, - "objective/scores": 0.0, - "policy/approxkl_avg": 164.2472381591797, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.841796875, - "step": 167, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000530958175659 - }, - { - "episode": 2704, - "epoch": 0.016201124012893794, - "loss/policy_avg": 0.19894596934318542, - "lr": 9.892638036809815e-06, - "objective/entropy": 137.1651153564453, - "objective/kl": 24.861934661865234, - "objective/non_score_reward": -1.2430968284606934, - "objective/rlhf_reward": -3.147558684619974, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 133.68283081054688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.66796875, - "step": 168, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997298240661621 - }, - { - "episode": 2720, - "epoch": 0.016296988652023343, - "loss/policy_avg": 0.12182526290416718, - "lr": 9.891998977505112e-06, - "objective/entropy": 18.006725311279297, - "objective/kl": 17.92361068725586, - "objective/non_score_reward": -0.8961805701255798, - "objective/rlhf_reward": -2.134124155254707, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 192.45278930664062, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.583984375, - "step": 169, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0006425380706787 - }, - { - "episode": 2736, - "epoch": 0.016392853291152892, - "loss/policy_avg": -0.012196972966194153, - "lr": 9.89135991820041e-06, - "objective/entropy": 115.16173553466797, - "objective/kl": 21.65146827697754, - "objective/non_score_reward": -1.082573413848877, - "objective/rlhf_reward": -2.5054651006785145, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 145.09487915039062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.71484375, - "step": 170, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9983795881271362 - }, - { - "episode": 2752, - "epoch": 0.01648871793028244, - "loss/policy_avg": 0.5326859951019287, - "lr": 9.890720858895706e-06, - "objective/entropy": 95.13655853271484, - "objective/kl": 24.00056266784668, - "objective/non_score_reward": -1.2000280618667603, - "objective/rlhf_reward": -4.800112426280975, - "objective/scores": 0.0, - "policy/approxkl_avg": 119.50138854980469, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.634765625, - "step": 171, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0006933212280273 - }, - { - "episode": 2768, - "epoch": 0.01658458256941199, - "loss/policy_avg": 0.3668867349624634, - "lr": 9.890081799591003e-06, - "objective/entropy": 132.36126708984375, - "objective/kl": 21.386262893676758, - "objective/non_score_reward": -1.0693132877349854, - "objective/rlhf_reward": -1.8772529125213624, - "objective/scores": 0.6, - "policy/approxkl_avg": 48.470794677734375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.666015625, - "step": 172, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9946337938308716 - }, - { - "episode": 2784, - "epoch": 0.01668044720854154, - "loss/policy_avg": 0.460104763507843, - "lr": 9.8894427402863e-06, - "objective/entropy": 129.8038330078125, - "objective/kl": 25.860858917236328, - "objective/non_score_reward": -1.2930430173873901, - "objective/rlhf_reward": -3.656400167735752, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 146.58050537109375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5625, - "step": 173, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9949872493743896 - }, - { - "episode": 2800, - "epoch": 0.016776311847671088, - "loss/policy_avg": 0.11980315297842026, - "lr": 9.888803680981595e-06, - "objective/entropy": 173.85202026367188, - "objective/kl": 23.159679412841797, - "objective/non_score_reward": -1.1579840183258057, - "objective/rlhf_reward": -6.631936073303223, - "objective/scores": -0.5, - "policy/approxkl_avg": 84.56037902832031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7734375, - "step": 174, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9972279071807861 - }, - { - "episode": 2816, - "epoch": 0.016872176486800637, - "loss/policy_avg": 0.060305699706077576, - "lr": 9.888164621676892e-06, - "objective/entropy": 90.63494110107422, - "objective/kl": 23.04631805419922, - "objective/non_score_reward": -1.1523159742355347, - "objective/rlhf_reward": -4.609263688325882, - "objective/scores": 0.0, - "policy/approxkl_avg": 204.8768310546875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.564453125, - "step": 175, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992055892944336 - }, - { - "episode": 2832, - "epoch": 0.016968041125930186, - "loss/policy_avg": 0.5130124092102051, - "lr": 9.887525562372189e-06, - "objective/entropy": 66.25984191894531, - "objective/kl": 32.70683288574219, - "objective/non_score_reward": -1.635341763496399, - "objective/rlhf_reward": -6.541367173194885, - "objective/scores": 0.0, - "policy/approxkl_avg": 265.0827941894531, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.609375, - "step": 176, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.996222734451294 - }, - { - "episode": 2848, - "epoch": 0.017063905765059735, - "loss/policy_avg": 0.008577877655625343, - "lr": 9.886886503067486e-06, - "objective/entropy": -118.17359924316406, - "objective/kl": 21.02519989013672, - "objective/non_score_reward": -1.051259994506836, - "objective/rlhf_reward": -4.205039799213409, - "objective/scores": 0.0, - "policy/approxkl_avg": 260.4126892089844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.66015625, - "step": 177, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000894069671631 - }, - { - "episode": 2864, - "epoch": 0.017159770404189284, - "loss/policy_avg": 0.18238189816474915, - "lr": 9.886247443762783e-06, - "objective/entropy": 130.0546875, - "objective/kl": 32.371009826660156, - "objective/non_score_reward": -1.6185506582260132, - "objective/rlhf_reward": -8.474203109741211, - "objective/scores": -0.5, - "policy/approxkl_avg": 228.2266387939453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7890625, - "step": 178, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9942381381988525 - }, - { - "episode": 2880, - "epoch": 0.017255635043318833, - "loss/policy_avg": 0.18286140263080597, - "lr": 9.88560838445808e-06, - "objective/entropy": -10.639881134033203, - "objective/kl": 29.253890991210938, - "objective/non_score_reward": -1.462694525718689, - "objective/rlhf_reward": -7.850778102874756, - "objective/scores": -0.5, - "policy/approxkl_avg": 148.62832641601562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.798828125, - "step": 179, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988160133361816 - }, - { - "episode": 2896, - "epoch": 0.017351499682448382, - "loss/policy_avg": 0.07891340553760529, - "lr": 9.884969325153375e-06, - "objective/entropy": -120.97007751464844, - "objective/kl": 21.97601890563965, - "objective/non_score_reward": -1.0988008975982666, - "objective/rlhf_reward": -2.2724974177041393, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 200.60455322265625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.533203125, - "step": 180, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000365972518921 - }, - { - "episode": 2912, - "epoch": 0.01744736432157793, - "loss/policy_avg": 0.06744587421417236, - "lr": 9.884330265848671e-06, - "objective/entropy": 73.97216796875, - "objective/kl": 19.66523551940918, - "objective/non_score_reward": -0.9832619428634644, - "objective/rlhf_reward": -5.933047771453857, - "objective/scores": -0.5, - "policy/approxkl_avg": 168.08172607421875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.603515625, - "step": 181, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997870683670044 - }, - { - "episode": 2928, - "epoch": 0.01754322896070748, - "loss/policy_avg": 0.12424597889184952, - "lr": 9.883691206543968e-06, - "objective/entropy": 77.82262420654297, - "objective/kl": 21.0150146484375, - "objective/non_score_reward": -1.050750732421875, - "objective/rlhf_reward": 0.1969969511032108, - "objective/scores": 1.1, - "policy/approxkl_avg": 109.60333251953125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.693359375, - "step": 182, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.996559500694275 - }, - { - "episode": 2944, - "epoch": 0.01763909359983703, - "loss/policy_avg": 0.2606327533721924, - "lr": 9.883052147239265e-06, - "objective/entropy": 172.60293579101562, - "objective/kl": 29.473426818847656, - "objective/non_score_reward": -1.4736714363098145, - "objective/rlhf_reward": -5.894685626029968, - "objective/scores": 0.0, - "policy/approxkl_avg": 194.51976013183594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.818359375, - "step": 183, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9962568283081055 - }, - { - "episode": 2960, - "epoch": 0.017734958238966578, - "loss/policy_avg": 0.10910254716873169, - "lr": 9.882413087934562e-06, - "objective/entropy": 240.20162963867188, - "objective/kl": 15.176373481750488, - "objective/non_score_reward": -0.7588187456130981, - "objective/rlhf_reward": 1.3647250771522526, - "objective/scores": 1.1, - "policy/approxkl_avg": 43.88645935058594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8203125, - "step": 184, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0006511211395264 - }, - { - "episode": 2976, - "epoch": 0.017830822878096127, - "loss/policy_avg": 0.5547807812690735, - "lr": 9.881774028629857e-06, - "objective/entropy": 85.18072509765625, - "objective/kl": 21.537092208862305, - "objective/non_score_reward": -1.0768545866012573, - "objective/rlhf_reward": -6.307418346405029, - "objective/scores": -0.5, - "policy/approxkl_avg": 161.64654541015625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.75390625, - "step": 185, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9996278285980225 - }, - { - "episode": 2992, - "epoch": 0.017926687517225676, - "loss/policy_avg": 0.3181283175945282, - "lr": 9.881134969325154e-06, - "objective/entropy": 129.28257751464844, - "objective/kl": 28.59075927734375, - "objective/non_score_reward": -1.4295378923416138, - "objective/rlhf_reward": -5.718151569366455, - "objective/scores": 0.0, - "policy/approxkl_avg": 53.536468505859375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.705078125, - "step": 186, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998319149017334 - }, - { - "episode": 3008, - "epoch": 0.018022552156355228, - "loss/policy_avg": 0.37268152832984924, - "lr": 9.880495910020451e-06, - "objective/entropy": 183.25308227539062, - "objective/kl": 29.692989349365234, - "objective/non_score_reward": -1.484649419784546, - "objective/rlhf_reward": -7.938598155975342, - "objective/scores": -0.5, - "policy/approxkl_avg": 91.56600952148438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.599609375, - "step": 187, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9983760118484497 - }, - { - "episode": 3024, - "epoch": 0.018118416795484777, - "loss/policy_avg": 0.7035294771194458, - "lr": 9.879856850715748e-06, - "objective/entropy": -141.20687866210938, - "objective/kl": 16.28227996826172, - "objective/non_score_reward": -0.8141138553619385, - "objective/rlhf_reward": -5.256455421447754, - "objective/scores": -0.5, - "policy/approxkl_avg": 39.17454528808594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.912109375, - "step": 188, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000697612762451 - }, - { - "episode": 3040, - "epoch": 0.018214281434614326, - "loss/policy_avg": 0.23234151303768158, - "lr": 9.879217791411043e-06, - "objective/entropy": 41.34138107299805, - "objective/kl": 27.16008758544922, - "objective/non_score_reward": -1.3580043315887451, - "objective/rlhf_reward": -4.05341557511459, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 271.9233093261719, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.51171875, - "step": 189, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999565839767456 - }, - { - "episode": 3056, - "epoch": 0.018310146073743875, - "loss/policy_avg": 0.07211380451917648, - "lr": 9.87857873210634e-06, - "objective/entropy": 20.789365768432617, - "objective/kl": 19.183855056762695, - "objective/non_score_reward": -0.9591927528381348, - "objective/rlhf_reward": -5.836771011352539, - "objective/scores": -0.5, - "policy/approxkl_avg": 108.85220336914062, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6328125, - "step": 190, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0007762908935547 - }, - { - "episode": 3072, - "epoch": 0.018406010712873424, - "loss/policy_avg": 0.7360332012176514, - "lr": 9.877939672801637e-06, - "objective/entropy": 219.01002502441406, - "objective/kl": 30.353984832763672, - "objective/non_score_reward": -1.5176992416381836, - "objective/rlhf_reward": -4.692195155707699, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 60.275230407714844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.73046875, - "step": 191, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9968408346176147 - }, - { - "episode": 3088, - "epoch": 0.018501875352002973, - "loss/policy_avg": 0.7660672664642334, - "lr": 9.877300613496934e-06, - "objective/entropy": 192.5721435546875, - "objective/kl": 18.974138259887695, - "objective/non_score_reward": -0.948706865310669, - "objective/rlhf_reward": -5.794827461242676, - "objective/scores": -0.5, - "policy/approxkl_avg": 208.0977783203125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.568359375, - "step": 192, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996788740158081 - }, - { - "episode": 3104, - "epoch": 0.018597739991132522, - "loss/policy_avg": 0.4530583620071411, - "lr": 9.876661554192229e-06, - "objective/entropy": 185.2235107421875, - "objective/kl": 24.102296829223633, - "objective/non_score_reward": -1.2051149606704712, - "objective/rlhf_reward": -0.4204598426818844, - "objective/scores": 1.1, - "policy/approxkl_avg": 34.94757080078125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.728515625, - "step": 193, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9967865943908691 - }, - { - "episode": 3120, - "epoch": 0.01869360463026207, - "loss/policy_avg": 0.03487266227602959, - "lr": 9.876022494887526e-06, - "objective/entropy": 183.6469268798828, - "objective/kl": 20.133817672729492, - "objective/non_score_reward": -1.0066908597946167, - "objective/rlhf_reward": -4.026763558387756, - "objective/scores": 0.0, - "policy/approxkl_avg": 168.301025390625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7578125, - "step": 194, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0027122497558594 - }, - { - "episode": 3136, - "epoch": 0.01878946926939162, - "loss/policy_avg": -0.029073666781187057, - "lr": 9.875383435582823e-06, - "objective/entropy": 138.25656127929688, - "objective/kl": 18.322010040283203, - "objective/non_score_reward": -0.9161005020141602, - "objective/rlhf_reward": -5.664402008056641, - "objective/scores": -0.5, - "policy/approxkl_avg": 60.4761848449707, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.673828125, - "step": 195, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9991329908370972 - }, - { - "episode": 3152, - "epoch": 0.01888533390852117, - "loss/policy_avg": 0.14693962037563324, - "lr": 9.87474437627812e-06, - "objective/entropy": 71.7930908203125, - "objective/kl": 19.49433135986328, - "objective/non_score_reward": -0.9747166633605957, - "objective/rlhf_reward": -3.8988667130470276, - "objective/scores": 0.0, - "policy/approxkl_avg": 126.81082153320312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.767578125, - "step": 196, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977989196777344 - }, - { - "episode": 3168, - "epoch": 0.018981198547650718, - "loss/policy_avg": 0.6557031869888306, - "lr": 9.874105316973416e-06, - "objective/entropy": -9.503684997558594, - "objective/kl": 21.540775299072266, - "objective/non_score_reward": -1.0770388841629028, - "objective/rlhf_reward": -2.9295533085740626, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 100.91127014160156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.619140625, - "step": 197, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9948209524154663 - }, - { - "episode": 3184, - "epoch": 0.019077063186780267, - "loss/policy_avg": 0.23461255431175232, - "lr": 9.873466257668712e-06, - "objective/entropy": -49.81024932861328, - "objective/kl": 20.112146377563477, - "objective/non_score_reward": -1.0056073665618896, - "objective/rlhf_reward": -6.0224289894104, - "objective/scores": -0.5, - "policy/approxkl_avg": 135.8631134033203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58203125, - "step": 198, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999420404434204 - }, - { - "episode": 3200, - "epoch": 0.019172927825909816, - "loss/policy_avg": -0.5007312297821045, - "lr": 9.872827198364009e-06, - "objective/entropy": -25.718414306640625, - "objective/kl": 15.317103385925293, - "objective/non_score_reward": -0.7658551931381226, - "objective/rlhf_reward": -5.06342077255249, - "objective/scores": -0.5, - "policy/approxkl_avg": 154.1348876953125, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.796875, - "step": 199, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.01686954498291 - }, - { - "episode": 3216, - "epoch": 0.019268792465039365, - "loss/policy_avg": -0.08025580644607544, - "lr": 9.872188139059305e-06, - "objective/entropy": 205.03077697753906, - "objective/kl": 21.50469207763672, - "objective/non_score_reward": -1.0752345323562622, - "objective/rlhf_reward": 0.09906184077262914, - "objective/scores": 1.1, - "policy/approxkl_avg": 16.594398498535156, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.67578125, - "step": 200, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998137354850769 - }, - { - "episode": 3232, - "epoch": 0.019364657104168913, - "loss/policy_avg": 0.09840521216392517, - "lr": 9.871549079754602e-06, - "objective/entropy": 94.7195053100586, - "objective/kl": 23.290691375732422, - "objective/non_score_reward": -1.164534568786621, - "objective/rlhf_reward": -6.658138275146484, - "objective/scores": -0.5, - "policy/approxkl_avg": 61.80426788330078, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.568359375, - "step": 201, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993457794189453 - }, - { - "episode": 3248, - "epoch": 0.019460521743298462, - "loss/policy_avg": 0.20220378041267395, - "lr": 9.8709100204499e-06, - "objective/entropy": 138.21493530273438, - "objective/kl": 26.57837677001953, - "objective/non_score_reward": -1.328918695449829, - "objective/rlhf_reward": -7.315675258636475, - "objective/scores": -0.5, - "policy/approxkl_avg": 125.10806274414062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.671875, - "step": 202, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978711605072021 - }, - { - "episode": 3264, - "epoch": 0.01955638638242801, - "loss/policy_avg": 0.20998699963092804, - "lr": 9.870270961145196e-06, - "objective/entropy": 183.40069580078125, - "objective/kl": 22.997217178344727, - "objective/non_score_reward": -1.1498608589172363, - "objective/rlhf_reward": -4.599443554878235, - "objective/scores": 0.0, - "policy/approxkl_avg": 99.0360336303711, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.599609375, - "step": 203, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988300800323486 - }, - { - "episode": 3280, - "epoch": 0.01965225102155756, - "loss/policy_avg": 0.11741530895233154, - "lr": 9.869631901840491e-06, - "objective/entropy": 10.714279174804688, - "objective/kl": 29.581377029418945, - "objective/non_score_reward": -1.4790689945220947, - "objective/rlhf_reward": -4.435322943146586, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 252.8714599609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.76953125, - "step": 204, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9974395036697388 - }, - { - "episode": 3296, - "epoch": 0.01974811566068711, - "loss/policy_avg": 0.28326037526130676, - "lr": 9.868992842535788e-06, - "objective/entropy": 115.94627380371094, - "objective/kl": 29.00347137451172, - "objective/non_score_reward": -1.4501736164093018, - "objective/rlhf_reward": -5.800694525241852, - "objective/scores": 0.0, - "policy/approxkl_avg": 219.1302032470703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.853515625, - "step": 205, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988524913787842 - }, - { - "episode": 3312, - "epoch": 0.019843980299816658, - "loss/policy_avg": 0.11076626181602478, - "lr": 9.868353783231085e-06, - "objective/entropy": 146.21900939941406, - "objective/kl": 33.56465530395508, - "objective/non_score_reward": -1.6782327890396118, - "objective/rlhf_reward": -6.712931394577026, - "objective/scores": 0.0, - "policy/approxkl_avg": 157.33514404296875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.908203125, - "step": 206, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998185157775879 - }, - { - "episode": 3328, - "epoch": 0.019939844938946207, - "loss/policy_avg": -0.04051626846194267, - "lr": 9.867714723926382e-06, - "objective/entropy": 80.3193130493164, - "objective/kl": 31.786861419677734, - "objective/non_score_reward": -1.5893430709838867, - "objective/rlhf_reward": -3.433653388859007, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 220.55699157714844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.52734375, - "step": 207, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982627630233765 - }, - { - "episode": 3344, - "epoch": 0.020035709578075756, - "loss/policy_avg": 0.0633954256772995, - "lr": 9.867075664621679e-06, - "objective/entropy": 167.83624267578125, - "objective/kl": 28.54816246032715, - "objective/non_score_reward": -1.427408218383789, - "objective/rlhf_reward": -7.709632873535156, - "objective/scores": -0.5, - "policy/approxkl_avg": 161.32330322265625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.77734375, - "step": 208, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0011916160583496 - }, - { - "episode": 3360, - "epoch": 0.020131574217205305, - "loss/policy_avg": 0.4134795665740967, - "lr": 9.866436605316974e-06, - "objective/entropy": 190.93576049804688, - "objective/kl": 31.057029724121094, - "objective/non_score_reward": -1.5528514385223389, - "objective/rlhf_reward": -4.852156007026119, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 412.91265869140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.833984375, - "step": 209, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9969439506530762 - }, - { - "episode": 3376, - "epoch": 0.020227438856334854, - "loss/policy_avg": 0.011405892670154572, - "lr": 9.86579754601227e-06, - "objective/entropy": -0.3257408142089844, - "objective/kl": 30.285137176513672, - "objective/non_score_reward": -1.5142569541931152, - "objective/rlhf_reward": -6.057027459144592, - "objective/scores": 0.0, - "policy/approxkl_avg": 255.63124084472656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.716796875, - "step": 210, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983439445495605 - }, - { - "episode": 3392, - "epoch": 0.020323303495464403, - "loss/policy_avg": 0.02998751401901245, - "lr": 9.865158486707568e-06, - "objective/entropy": 18.811004638671875, - "objective/kl": 26.8281307220459, - "objective/non_score_reward": -1.3414065837860107, - "objective/rlhf_reward": -2.4419072612535686, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 244.22311401367188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.79296875, - "step": 211, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999415636062622 - }, - { - "episode": 3408, - "epoch": 0.020419168134593952, - "loss/policy_avg": 0.3882741332054138, - "lr": 9.864519427402863e-06, - "objective/entropy": 151.1260223388672, - "objective/kl": 34.16276550292969, - "objective/non_score_reward": -1.708138108253479, - "objective/rlhf_reward": -5.170693164289581, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 124.56742858886719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.654296875, - "step": 212, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9970864057540894 - }, - { - "episode": 3424, - "epoch": 0.0205150327737235, - "loss/policy_avg": 0.2528703212738037, - "lr": 9.86388036809816e-06, - "objective/entropy": 71.80561065673828, - "objective/kl": 30.099973678588867, - "objective/non_score_reward": -1.5049986839294434, - "objective/rlhf_reward": -8.019994735717773, - "objective/scores": -0.5, - "policy/approxkl_avg": 310.709716796875, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.748046875, - "step": 213, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997683048248291 - }, - { - "episode": 3440, - "epoch": 0.02061089741285305, - "loss/policy_avg": -0.011442364193499088, - "lr": 9.863241308793457e-06, - "objective/entropy": 148.97042846679688, - "objective/kl": 25.652734756469727, - "objective/non_score_reward": -1.2826368808746338, - "objective/rlhf_reward": -2.7305472850799557, - "objective/scores": 0.6, - "policy/approxkl_avg": 60.726966857910156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.640625, - "step": 214, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99959135055542 - }, - { - "episode": 3456, - "epoch": 0.0207067620519826, - "loss/policy_avg": 0.30335062742233276, - "lr": 9.862602249488753e-06, - "objective/entropy": 64.34867858886719, - "objective/kl": 26.22498321533203, - "objective/non_score_reward": -1.3112492561340332, - "objective/rlhf_reward": -7.244997024536133, - "objective/scores": -0.5, - "policy/approxkl_avg": 180.33152770996094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.716796875, - "step": 215, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99501633644104 - }, - { - "episode": 3472, - "epoch": 0.02080262669111215, - "loss/policy_avg": 0.2380252331495285, - "lr": 9.86196319018405e-06, - "objective/entropy": -28.018264770507812, - "objective/kl": 25.426055908203125, - "objective/non_score_reward": -1.2713027000427246, - "objective/rlhf_reward": -5.0852110385894775, - "objective/scores": 0.0, - "policy/approxkl_avg": 168.28411865234375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.623046875, - "step": 216, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.996917486190796 - }, - { - "episode": 3488, - "epoch": 0.0208984913302417, - "loss/policy_avg": 0.35980474948883057, - "lr": 9.861324130879346e-06, - "objective/entropy": -56.049781799316406, - "objective/kl": 18.423480987548828, - "objective/non_score_reward": -0.9211740493774414, - "objective/rlhf_reward": -3.6846961677074432, - "objective/scores": 0.0, - "policy/approxkl_avg": 23.263450622558594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619140625, - "step": 217, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986913204193115 - }, - { - "episode": 3504, - "epoch": 0.02099435596937125, - "loss/policy_avg": 0.11377542465925217, - "lr": 9.860685071574642e-06, - "objective/entropy": 51.95567321777344, - "objective/kl": 30.016387939453125, - "objective/non_score_reward": -1.500819444656372, - "objective/rlhf_reward": -8.003277778625488, - "objective/scores": -0.5, - "policy/approxkl_avg": 219.76956176757812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.673828125, - "step": 218, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001506805419922 - }, - { - "episode": 3520, - "epoch": 0.021090220608500798, - "loss/policy_avg": 0.11929692327976227, - "lr": 9.86004601226994e-06, - "objective/entropy": 54.36243438720703, - "objective/kl": 24.446704864501953, - "objective/non_score_reward": -1.2223353385925293, - "objective/rlhf_reward": -4.8893409967422485, - "objective/scores": 0.0, - "policy/approxkl_avg": 19.67999267578125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587890625, - "step": 219, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0119833946228027 - }, - { - "episode": 3536, - "epoch": 0.021186085247630347, - "loss/policy_avg": 1.1937235593795776, - "lr": 9.859406952965236e-06, - "objective/entropy": -8.128410339355469, - "objective/kl": 29.450044631958008, - "objective/non_score_reward": -1.4725021123886108, - "objective/rlhf_reward": -4.065179969343255, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 190.29078674316406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 220, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001643180847168 - }, - { - "episode": 3552, - "epoch": 0.021281949886759896, - "loss/policy_avg": 0.48739299178123474, - "lr": 9.858767893660533e-06, - "objective/entropy": 96.00523376464844, - "objective/kl": 25.66995620727539, - "objective/non_score_reward": -1.2834978103637695, - "objective/rlhf_reward": -3.8084781503974625, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 83.90371704101562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.44921875, - "step": 221, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984304904937744 - }, - { - "episode": 3568, - "epoch": 0.021377814525889445, - "loss/policy_avg": 0.4375818967819214, - "lr": 9.858128834355828e-06, - "objective/entropy": 75.25170135498047, - "objective/kl": 31.37661361694336, - "objective/non_score_reward": -1.5688308477401733, - "objective/rlhf_reward": -4.949810538321657, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 113.29835510253906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.515625, - "step": 222, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974620342254639 - }, - { - "episode": 3584, - "epoch": 0.021473679165018994, - "loss/policy_avg": 0.18842488527297974, - "lr": 9.857489775051125e-06, - "objective/entropy": 62.053443908691406, - "objective/kl": 30.84737777709961, - "objective/non_score_reward": -1.5423686504364014, - "objective/rlhf_reward": -6.16947478055954, - "objective/scores": 0.0, - "policy/approxkl_avg": 170.18569946289062, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.58984375, - "step": 223, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998410701751709 - }, - { - "episode": 3600, - "epoch": 0.021569543804148543, - "loss/policy_avg": 0.3384511470794678, - "lr": 9.856850715746422e-06, - "objective/entropy": 80.18526458740234, - "objective/kl": 23.6530818939209, - "objective/non_score_reward": -1.1826542615890503, - "objective/rlhf_reward": -4.730616986751556, - "objective/scores": 0.0, - "policy/approxkl_avg": 3.6079671382904053, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 224, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994169473648071 - }, - { - "episode": 3616, - "epoch": 0.021665408443278092, - "loss/policy_avg": 0.29894721508026123, - "lr": 9.856211656441719e-06, - "objective/entropy": 165.5200958251953, - "objective/kl": 18.515085220336914, - "objective/non_score_reward": -0.9257543087005615, - "objective/rlhf_reward": -3.703017294406891, - "objective/scores": 0.0, - "policy/approxkl_avg": 74.32862854003906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.70703125, - "step": 225, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9962903261184692 - }, - { - "episode": 3632, - "epoch": 0.02176127308240764, - "loss/policy_avg": 0.46412864327430725, - "lr": 9.855572597137016e-06, - "objective/entropy": 89.5113525390625, - "objective/kl": 29.43311882019043, - "objective/non_score_reward": -1.4716558456420898, - "objective/rlhf_reward": -7.886623382568359, - "objective/scores": -0.5, - "policy/approxkl_avg": 100.8701171875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.69140625, - "step": 226, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9992918968200684 - }, - { - "episode": 3648, - "epoch": 0.02185713772153719, - "loss/policy_avg": -0.17664140462875366, - "lr": 9.854933537832313e-06, - "objective/entropy": -5.129295349121094, - "objective/kl": 16.053768157958984, - "objective/non_score_reward": -0.802688479423523, - "objective/rlhf_reward": -5.210753917694092, - "objective/scores": -0.5, - "policy/approxkl_avg": 10.066779136657715, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.525390625, - "step": 227, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001509189605713 - }, - { - "episode": 3664, - "epoch": 0.02195300236066674, - "loss/policy_avg": 0.23512879014015198, - "lr": 9.854294478527608e-06, - "objective/entropy": 236.32894897460938, - "objective/kl": 31.84699058532715, - "objective/non_score_reward": -1.5923495292663574, - "objective/rlhf_reward": -8.36939811706543, - "objective/scores": -0.5, - "policy/approxkl_avg": 100.35233306884766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.73046875, - "step": 228, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9997907876968384 - }, - { - "episode": 3680, - "epoch": 0.022048866999796288, - "loss/policy_avg": -0.021687505766749382, - "lr": 9.853655419222905e-06, - "objective/entropy": 156.21514892578125, - "objective/kl": 33.45891189575195, - "objective/non_score_reward": -1.6729457378387451, - "objective/rlhf_reward": -6.691782712936401, - "objective/scores": 0.0, - "policy/approxkl_avg": 59.40328598022461, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.69140625, - "step": 229, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9973727464675903 - }, - { - "episode": 3696, - "epoch": 0.022144731638925837, - "loss/policy_avg": 0.2921329736709595, - "lr": 9.853016359918202e-06, - "objective/entropy": 283.9156494140625, - "objective/kl": 28.28559112548828, - "objective/non_score_reward": -1.4142796993255615, - "objective/rlhf_reward": -5.657118558883667, - "objective/scores": 0.0, - "policy/approxkl_avg": 67.94718933105469, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.876953125, - "step": 230, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9960227012634277 - }, - { - "episode": 3712, - "epoch": 0.022240596278055386, - "loss/policy_avg": 0.05423973500728607, - "lr": 9.852377300613498e-06, - "objective/entropy": 138.2334442138672, - "objective/kl": 43.57646560668945, - "objective/non_score_reward": -2.1788229942321777, - "objective/rlhf_reward": -7.159033386912897, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 98.089111328125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6875, - "step": 231, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993014335632324 - }, - { - "episode": 3728, - "epoch": 0.022336460917184935, - "loss/policy_avg": 0.19056108593940735, - "lr": 9.851738241308795e-06, - "objective/entropy": 10.413976669311523, - "objective/kl": 18.887348175048828, - "objective/non_score_reward": -0.9443674087524414, - "objective/rlhf_reward": -3.777469515800476, - "objective/scores": 0.0, - "policy/approxkl_avg": 105.73004150390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.59765625, - "step": 232, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9962282180786133 - }, - { - "episode": 3744, - "epoch": 0.022432325556314484, - "loss/policy_avg": 0.4601524770259857, - "lr": 9.85109918200409e-06, - "objective/entropy": -74.81282043457031, - "objective/kl": 24.299213409423828, - "objective/non_score_reward": -1.2149605751037598, - "objective/rlhf_reward": -4.859842598438263, - "objective/scores": 0.0, - "policy/approxkl_avg": 34.33855056762695, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.638671875, - "step": 233, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0012383460998535 - }, - { - "episode": 3760, - "epoch": 0.022528190195444033, - "loss/policy_avg": 0.40300655364990234, - "lr": 9.850460122699387e-06, - "objective/entropy": -45.57096481323242, - "objective/kl": 30.75171661376953, - "objective/non_score_reward": -1.537585973739624, - "objective/rlhf_reward": -8.15034294128418, - "objective/scores": -0.5, - "policy/approxkl_avg": 57.83643341064453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.791015625, - "step": 234, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9980592727661133 - }, - { - "episode": 3776, - "epoch": 0.02262405483457358, - "loss/policy_avg": 0.08251257985830307, - "lr": 9.849821063394683e-06, - "objective/entropy": -39.57012176513672, - "objective/kl": 21.46126937866211, - "objective/non_score_reward": -1.0730634927749634, - "objective/rlhf_reward": -2.9667412376701066, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 8.371784210205078, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.591796875, - "step": 235, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99739670753479 - }, - { - "episode": 3792, - "epoch": 0.02271991947370313, - "loss/policy_avg": 0.287686824798584, - "lr": 9.84918200408998e-06, - "objective/entropy": 225.470703125, - "objective/kl": 32.462642669677734, - "objective/non_score_reward": -1.6231322288513184, - "objective/rlhf_reward": -5.092528736591339, - "objective/scores": 0.35, - "policy/approxkl_avg": 233.72256469726562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.685546875, - "step": 236, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9972712993621826 - }, - { - "episode": 3808, - "epoch": 0.02281578411283268, - "loss/policy_avg": 0.40615230798721313, - "lr": 9.848542944785276e-06, - "objective/entropy": 138.72113037109375, - "objective/kl": 36.74411392211914, - "objective/non_score_reward": -1.8372057676315308, - "objective/rlhf_reward": -7.348823070526123, - "objective/scores": 0.0, - "policy/approxkl_avg": 68.71369934082031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.51171875, - "step": 237, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0015788078308105 - }, - { - "episode": 3824, - "epoch": 0.02291164875196223, - "loss/policy_avg": 0.2618522644042969, - "lr": 9.847903885480573e-06, - "objective/entropy": 218.24368286132812, - "objective/kl": 28.746074676513672, - "objective/non_score_reward": -1.4373037815093994, - "objective/rlhf_reward": -4.145094964567738, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 35.09134292602539, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 238, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968454837799072 - }, - { - "episode": 3840, - "epoch": 0.023007513391091777, - "loss/policy_avg": 0.44723182916641235, - "lr": 9.84726482617587e-06, - "objective/entropy": 134.8599853515625, - "objective/kl": 30.93494415283203, - "objective/non_score_reward": -1.5467472076416016, - "objective/rlhf_reward": -4.630729882922724, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 72.10969543457031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.666015625, - "step": 239, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997701644897461 - }, - { - "episode": 3856, - "epoch": 0.023103378030221326, - "loss/policy_avg": 1.2493870258331299, - "lr": 9.846625766871167e-06, - "objective/entropy": 101.4572525024414, - "objective/kl": 29.70997428894043, - "objective/non_score_reward": -1.4854986667633057, - "objective/rlhf_reward": -5.941995084285736, - "objective/scores": 0.0, - "policy/approxkl_avg": 100.62832641601562, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.517578125, - "step": 240, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9961397647857666 - }, - { - "episode": 3872, - "epoch": 0.023199242669350875, - "loss/policy_avg": 0.11165004968643188, - "lr": 9.845986707566462e-06, - "objective/entropy": -147.07489013671875, - "objective/kl": 25.88149642944336, - "objective/non_score_reward": -1.2940750122070312, - "objective/rlhf_reward": -7.176300048828125, - "objective/scores": -0.5, - "policy/approxkl_avg": 239.4694061279297, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.705078125, - "step": 241, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998302698135376 - }, - { - "episode": 3888, - "epoch": 0.023295107308480424, - "loss/policy_avg": 0.24919648468494415, - "lr": 9.845347648261759e-06, - "objective/entropy": -68.71859741210938, - "objective/kl": 27.074668884277344, - "objective/non_score_reward": -1.3537335395812988, - "objective/rlhf_reward": -7.414934158325195, - "objective/scores": -0.5, - "policy/approxkl_avg": 81.3831787109375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.443359375, - "step": 242, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9985148906707764 - }, - { - "episode": 3904, - "epoch": 0.023390971947609973, - "loss/policy_avg": 0.26694488525390625, - "lr": 9.844708588957056e-06, - "objective/entropy": 94.072265625, - "objective/kl": 28.564594268798828, - "objective/non_score_reward": -1.428229808807373, - "objective/rlhf_reward": -7.712919235229492, - "objective/scores": -0.5, - "policy/approxkl_avg": 219.86279296875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8515625, - "step": 243, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9972708225250244 - }, - { - "episode": 3920, - "epoch": 0.023486836586739522, - "loss/policy_avg": 0.6054710149765015, - "lr": 9.844069529652353e-06, - "objective/entropy": 129.7139892578125, - "objective/kl": 30.57644271850586, - "objective/non_score_reward": -1.5288220643997192, - "objective/rlhf_reward": -8.115287780761719, - "objective/scores": -0.5, - "policy/approxkl_avg": 83.28874206542969, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.712890625, - "step": 244, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9941790103912354 - }, - { - "episode": 3936, - "epoch": 0.023582701225869074, - "loss/policy_avg": 0.12586408853530884, - "lr": 9.84343047034765e-06, - "objective/entropy": 241.99429321289062, - "objective/kl": 27.60189437866211, - "objective/non_score_reward": -1.3800947666168213, - "objective/rlhf_reward": -5.520378828048706, - "objective/scores": 0.0, - "policy/approxkl_avg": 73.93728637695312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.83984375, - "step": 245, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9990196228027344 - }, - { - "episode": 3952, - "epoch": 0.023678565864998623, - "loss/policy_avg": 0.4687037467956543, - "lr": 9.842791411042945e-06, - "objective/entropy": 90.39759826660156, - "objective/kl": 26.80643081665039, - "objective/non_score_reward": -1.3403215408325195, - "objective/rlhf_reward": -7.361286163330078, - "objective/scores": -0.5, - "policy/approxkl_avg": 122.53802490234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9375, - "step": 246, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.99778151512146 - }, - { - "episode": 3968, - "epoch": 0.023774430504128172, - "loss/policy_avg": 0.7288471460342407, - "lr": 9.842152351738242e-06, - "objective/entropy": 111.89739227294922, - "objective/kl": 26.556848526000977, - "objective/non_score_reward": -1.3278424739837646, - "objective/rlhf_reward": -5.311370104551315, - "objective/scores": 0.0, - "policy/approxkl_avg": 153.5714874267578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58984375, - "step": 247, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0046982765197754 - }, - { - "episode": 3984, - "epoch": 0.02387029514325772, - "loss/policy_avg": 0.4886789321899414, - "lr": 9.841513292433539e-06, - "objective/entropy": 143.58645629882812, - "objective/kl": 23.396575927734375, - "objective/non_score_reward": -1.1698288917541504, - "objective/rlhf_reward": -2.854486520561289, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 99.15058898925781, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.45703125, - "step": 248, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9984853267669678 - }, - { - "episode": 4000, - "epoch": 0.02396615978238727, - "loss/policy_avg": 0.20749720931053162, - "lr": 9.840874233128836e-06, - "objective/entropy": 34.73357391357422, - "objective/kl": 20.36009979248047, - "objective/non_score_reward": -1.0180050134658813, - "objective/rlhf_reward": -4.07202011346817, - "objective/scores": 0.0, - "policy/approxkl_avg": 159.967529296875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.580078125, - "step": 249, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0006356239318848 - }, - { - "episode": 4016, - "epoch": 0.02406202442151682, - "loss/policy_avg": 0.3030295670032501, - "lr": 9.840235173824132e-06, - "objective/entropy": -112.51934814453125, - "objective/kl": 33.881568908691406, - "objective/non_score_reward": -1.6940785646438599, - "objective/rlhf_reward": -4.951485450538705, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 146.98245239257812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.65234375, - "step": 250, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9988691806793213 - }, - { - "episode": 4032, - "epoch": 0.024157889060646368, - "loss/policy_avg": 0.6671163439750671, - "lr": 9.83959611451943e-06, - "objective/entropy": -65.78260803222656, - "objective/kl": 24.848434448242188, - "objective/non_score_reward": -1.2424218654632568, - "objective/rlhf_reward": -6.969687461853027, - "objective/scores": -0.5, - "policy/approxkl_avg": 93.62220764160156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.58203125, - "step": 251, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9968496561050415 - }, - { - "episode": 4048, - "epoch": 0.024253753699775917, - "loss/policy_avg": 0.1528814733028412, - "lr": 9.838957055214724e-06, - "objective/entropy": 203.98094177246094, - "objective/kl": 40.56060791015625, - "objective/non_score_reward": -2.0280303955078125, - "objective/rlhf_reward": -10.11212158203125, - "objective/scores": -0.5, - "policy/approxkl_avg": 216.96200561523438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.935546875, - "step": 252, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9983346462249756 - }, - { - "episode": 4064, - "epoch": 0.024349618338905466, - "loss/policy_avg": 0.1612689346075058, - "lr": 9.838317995910021e-06, - "objective/entropy": 199.67910766601562, - "objective/kl": 17.4683780670166, - "objective/non_score_reward": -0.873418927192688, - "objective/rlhf_reward": 0.9063242912292484, - "objective/scores": 1.1, - "policy/approxkl_avg": 40.0599365234375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.685546875, - "step": 253, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0007472038269043 - }, - { - "episode": 4080, - "epoch": 0.024445482978035015, - "loss/policy_avg": 0.1594327837228775, - "lr": 9.837678936605318e-06, - "objective/entropy": 272.66253662109375, - "objective/kl": 22.29098129272461, - "objective/non_score_reward": -1.114549160003662, - "objective/rlhf_reward": -3.058196461200714, - "objective/scores": 0.35, - "policy/approxkl_avg": 11.639923095703125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8359375, - "step": 254, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9991519451141357 - }, - { - "episode": 4096, - "epoch": 0.024541347617164564, - "loss/policy_avg": 0.08180014789104462, - "lr": 9.837039877300615e-06, - "objective/entropy": 67.96817779541016, - "objective/kl": 19.31763458251953, - "objective/non_score_reward": -0.9658817052841187, - "objective/rlhf_reward": -5.863526821136475, - "objective/scores": -0.5, - "policy/approxkl_avg": 9.514694213867188, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.556640625, - "step": 255, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9998260736465454 - }, - { - "episode": 4112, - "epoch": 0.024637212256294113, - "loss/policy_avg": 0.05669542774558067, - "lr": 9.83640081799591e-06, - "objective/entropy": 74.68458557128906, - "objective/kl": 24.917343139648438, - "objective/non_score_reward": -1.2458672523498535, - "objective/rlhf_reward": -3.158640022548746, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 20.251989364624023, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.60546875, - "step": 256, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0000245571136475 - }, - { - "episode": 4128, - "epoch": 0.024733076895423662, - "loss/policy_avg": -0.04459148645401001, - "lr": 9.835761758691207e-06, - "objective/entropy": 44.43208312988281, - "objective/kl": 26.832380294799805, - "objective/non_score_reward": -1.3416190147399902, - "objective/rlhf_reward": -3.9878734732545436, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 121.80049896240234, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.61328125, - "step": 257, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0049991607666016 - }, - { - "episode": 4144, - "epoch": 0.02482894153455321, - "loss/policy_avg": 0.2643028497695923, - "lr": 9.835122699386504e-06, - "objective/entropy": -58.66691207885742, - "objective/kl": 19.387344360351562, - "objective/non_score_reward": -0.969367265701294, - "objective/rlhf_reward": -5.877469062805176, - "objective/scores": -0.5, - "policy/approxkl_avg": 45.993797302246094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5859375, - "step": 258, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9965884685516357 - }, - { - "episode": 4160, - "epoch": 0.02492480617368276, - "loss/policy_avg": 0.6535857915878296, - "lr": 9.8344836400818e-06, - "objective/entropy": 142.896484375, - "objective/kl": 22.385143280029297, - "objective/non_score_reward": -1.1192572116851807, - "objective/rlhf_reward": -6.477028846740723, - "objective/scores": -0.5, - "policy/approxkl_avg": 159.0670166015625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.90625, - "step": 259, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975087642669678 - }, - { - "episode": 4176, - "epoch": 0.02502067081281231, - "loss/policy_avg": 0.06705514341592789, - "lr": 9.833844580777096e-06, - "objective/entropy": 85.35755920410156, - "objective/kl": 42.735443115234375, - "objective/non_score_reward": -2.1367719173431396, - "objective/rlhf_reward": -6.8852281622296445, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 59.29423522949219, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.513671875, - "step": 260, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998096227645874 - }, - { - "episode": 4192, - "epoch": 0.025116535451941858, - "loss/policy_avg": 0.16437175869941711, - "lr": 9.833205521472393e-06, - "objective/entropy": 238.88308715820312, - "objective/kl": 35.51251983642578, - "objective/non_score_reward": -1.7756261825561523, - "objective/rlhf_reward": -9.10250473022461, - "objective/scores": -0.5, - "policy/approxkl_avg": 87.11653137207031, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.75390625, - "step": 261, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9962289333343506 - }, - { - "episode": 4208, - "epoch": 0.025212400091071407, - "loss/policy_avg": 0.2615561783313751, - "lr": 9.83256646216769e-06, - "objective/entropy": 165.4353790283203, - "objective/kl": 31.884780883789062, - "objective/non_score_reward": -1.5942389965057373, - "objective/rlhf_reward": -4.772836122576313, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 145.626708984375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.689453125, - "step": 262, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9968907833099365 - }, - { - "episode": 4224, - "epoch": 0.025308264730200956, - "loss/policy_avg": 0.1498258411884308, - "lr": 9.831927402862987e-06, - "objective/entropy": 114.43228149414062, - "objective/kl": 34.159423828125, - "objective/non_score_reward": -1.7079713344573975, - "objective/rlhf_reward": -8.831884384155273, - "objective/scores": -0.5, - "policy/approxkl_avg": 120.08966827392578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6640625, - "step": 263, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.995566964149475 - }, - { - "episode": 4240, - "epoch": 0.025404129369330505, - "loss/policy_avg": 0.9565318822860718, - "lr": 9.831288343558284e-06, - "objective/entropy": 30.085983276367188, - "objective/kl": 30.190189361572266, - "objective/non_score_reward": -1.5095094442367554, - "objective/rlhf_reward": -6.038037717342377, - "objective/scores": 0.0, - "policy/approxkl_avg": 30.665681838989258, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.548828125, - "step": 264, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984924793243408 - }, - { - "episode": 4256, - "epoch": 0.025499994008460054, - "loss/policy_avg": 0.1617015153169632, - "lr": 9.830649284253579e-06, - "objective/entropy": 85.59569549560547, - "objective/kl": 29.51198959350586, - "objective/non_score_reward": -1.4755992889404297, - "objective/rlhf_reward": -1.5023974239826199, - "objective/scores": 1.1, - "policy/approxkl_avg": 238.41380310058594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.62109375, - "step": 265, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998697280883789 - }, - { - "episode": 4272, - "epoch": 0.025595858647589603, - "loss/policy_avg": 0.12634433805942535, - "lr": 9.830010224948876e-06, - "objective/entropy": -17.845001220703125, - "objective/kl": 23.098552703857422, - "objective/non_score_reward": -1.1549276113510132, - "objective/rlhf_reward": -6.619710445404053, - "objective/scores": -0.5, - "policy/approxkl_avg": 43.69245147705078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.564453125, - "step": 266, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993417263031006 - }, - { - "episode": 4288, - "epoch": 0.02569172328671915, - "loss/policy_avg": 0.1535305678844452, - "lr": 9.829371165644173e-06, - "objective/entropy": 149.2012481689453, - "objective/kl": 30.30670928955078, - "objective/non_score_reward": -1.5153354406356812, - "objective/rlhf_reward": -4.545570218356785, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 55.40291976928711, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.517578125, - "step": 267, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999010682106018 - }, - { - "episode": 4304, - "epoch": 0.0257875879258487, - "loss/policy_avg": 0.3865639567375183, - "lr": 9.82873210633947e-06, - "objective/entropy": 96.54017639160156, - "objective/kl": 32.002784729003906, - "objective/non_score_reward": -1.6001390218734741, - "objective/rlhf_reward": -8.400556564331055, - "objective/scores": -0.5, - "policy/approxkl_avg": 31.64997100830078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5859375, - "step": 268, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9976623058319092 - }, - { - "episode": 4320, - "epoch": 0.02588345256497825, - "loss/policy_avg": -0.003022553399205208, - "lr": 9.828093047034766e-06, - "objective/entropy": 184.17962646484375, - "objective/kl": 34.98113250732422, - "objective/non_score_reward": -1.7490566968917847, - "objective/rlhf_reward": -5.480455153974232, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 177.02108764648438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.58984375, - "step": 269, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0039427280426025 - }, - { - "episode": 4336, - "epoch": 0.0259793172041078, - "loss/policy_avg": 0.22940891981124878, - "lr": 9.827453987730061e-06, - "objective/entropy": 97.79884338378906, - "objective/kl": 28.352216720581055, - "objective/non_score_reward": -1.4176108837127686, - "objective/rlhf_reward": -5.670443296432495, - "objective/scores": 0.0, - "policy/approxkl_avg": 50.27137756347656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7734375, - "step": 270, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.995865821838379 - }, - { - "episode": 4352, - "epoch": 0.026075181843237347, - "loss/policy_avg": 0.5798227190971375, - "lr": 9.826814928425358e-06, - "objective/entropy": 114.09043884277344, - "objective/kl": 30.19118309020996, - "objective/non_score_reward": -1.509559154510498, - "objective/rlhf_reward": -6.038236498832703, - "objective/scores": 0.0, - "policy/approxkl_avg": 34.826072692871094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6328125, - "step": 271, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0023837089538574 - }, - { - "episode": 4368, - "epoch": 0.026171046482366896, - "loss/policy_avg": -0.021535426378250122, - "lr": 9.826175869120655e-06, - "objective/entropy": -37.564857482910156, - "objective/kl": 18.688800811767578, - "objective/non_score_reward": -0.9344400763511658, - "objective/rlhf_reward": -5.737760543823242, - "objective/scores": -0.5, - "policy/approxkl_avg": 5.649078369140625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5859375, - "step": 272, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990510940551758 - }, - { - "episode": 4384, - "epoch": 0.02626691112149645, - "loss/policy_avg": 0.22535109519958496, - "lr": 9.825536809815952e-06, - "objective/entropy": 92.80372619628906, - "objective/kl": 36.460060119628906, - "objective/non_score_reward": -1.8230029344558716, - "objective/rlhf_reward": -5.9327618715509605, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 71.66783142089844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6875, - "step": 273, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.996490716934204 - }, - { - "episode": 4400, - "epoch": 0.026362775760625998, - "loss/policy_avg": 0.1513216346502304, - "lr": 9.824897750511249e-06, - "objective/entropy": 107.88948059082031, - "objective/kl": 27.020774841308594, - "objective/non_score_reward": -1.3510388135910034, - "objective/rlhf_reward": -5.404155135154724, - "objective/scores": 0.0, - "policy/approxkl_avg": 2.633958339691162, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.353515625, - "step": 274, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999776840209961 - }, - { - "episode": 4416, - "epoch": 0.026458640399755547, - "loss/policy_avg": 0.2180587649345398, - "lr": 9.824258691206546e-06, - "objective/entropy": 237.6265411376953, - "objective/kl": 34.5337028503418, - "objective/non_score_reward": -1.7266850471496582, - "objective/rlhf_reward": -5.350481002536371, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 64.67985534667969, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7578125, - "step": 275, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0010175704956055 - }, - { - "episode": 4432, - "epoch": 0.026554505038885096, - "loss/policy_avg": 0.1868075728416443, - "lr": 9.823619631901841e-06, - "objective/entropy": 153.34646606445312, - "objective/kl": 37.78309631347656, - "objective/non_score_reward": -1.8891546726226807, - "objective/rlhf_reward": -7.556619048118591, - "objective/scores": 0.0, - "policy/approxkl_avg": 104.59550476074219, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.814453125, - "step": 276, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994056224822998 - }, - { - "episode": 4448, - "epoch": 0.026650369678014645, - "loss/policy_avg": 0.9853407144546509, - "lr": 9.822980572597138e-06, - "objective/entropy": 241.00967407226562, - "objective/kl": 34.986572265625, - "objective/non_score_reward": -1.74932861328125, - "objective/rlhf_reward": -5.393194708887654, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 219.53729248046875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7421875, - "step": 277, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9966635704040527 - }, - { - "episode": 4464, - "epoch": 0.026746234317144194, - "loss/policy_avg": -0.05517375469207764, - "lr": 9.822341513292433e-06, - "objective/entropy": -5.935462951660156, - "objective/kl": 28.799835205078125, - "objective/non_score_reward": -1.4399919509887695, - "objective/rlhf_reward": -7.75996732711792, - "objective/scores": -0.5, - "policy/approxkl_avg": 7.517549514770508, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.623046875, - "step": 278, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0015547275543213 - }, - { - "episode": 4480, - "epoch": 0.026842098956273742, - "loss/policy_avg": 0.5763638019561768, - "lr": 9.82170245398773e-06, - "objective/entropy": 109.34716796875, - "objective/kl": 33.456336975097656, - "objective/non_score_reward": -1.6728168725967407, - "objective/rlhf_reward": -6.691267490386963, - "objective/scores": 0.0, - "policy/approxkl_avg": 87.05907440185547, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.529296875, - "step": 279, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9927139282226562 - }, - { - "episode": 4496, - "epoch": 0.02693796359540329, - "loss/policy_avg": 0.060494083911180496, - "lr": 9.821063394683027e-06, - "objective/entropy": 73.89436340332031, - "objective/kl": 30.13658905029297, - "objective/non_score_reward": -1.5068295001983643, - "objective/rlhf_reward": -8.027318000793457, - "objective/scores": -0.5, - "policy/approxkl_avg": 238.1710662841797, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 280, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9975123405456543 - }, - { - "episode": 4512, - "epoch": 0.02703382823453284, - "loss/policy_avg": 1.1403638124465942, - "lr": 9.820424335378324e-06, - "objective/entropy": 78.37907409667969, - "objective/kl": 35.09062194824219, - "objective/non_score_reward": -1.7545311450958252, - "objective/rlhf_reward": -7.018124580383301, - "objective/scores": 0.0, - "policy/approxkl_avg": 51.29010772705078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.544921875, - "step": 281, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9980716705322266 - }, - { - "episode": 4528, - "epoch": 0.02712969287366239, - "loss/policy_avg": 0.0838393121957779, - "lr": 9.81978527607362e-06, - "objective/entropy": 12.182060241699219, - "objective/kl": 30.722957611083984, - "objective/non_score_reward": -1.5361478328704834, - "objective/rlhf_reward": -4.197180102543767, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 42.573402404785156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.609375, - "step": 282, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99773371219635 - }, - { - "episode": 4544, - "epoch": 0.02722555751279194, - "loss/policy_avg": 0.10264723747968674, - "lr": 9.819146216768916e-06, - "objective/entropy": -15.691246032714844, - "objective/kl": 31.322179794311523, - "objective/non_score_reward": -1.5661091804504395, - "objective/rlhf_reward": -8.264436721801758, - "objective/scores": -0.5, - "policy/approxkl_avg": 216.12973022460938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.77734375, - "step": 283, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982627630233765 - }, - { - "episode": 4560, - "epoch": 0.027321422151921487, - "loss/policy_avg": -0.03270050510764122, - "lr": 9.818507157464213e-06, - "objective/entropy": 139.46694946289062, - "objective/kl": 25.990922927856445, - "objective/non_score_reward": -1.299546241760254, - "objective/rlhf_reward": -3.5940649843850903, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 66.3836441040039, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.62109375, - "step": 284, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0002379417419434 - }, - { - "episode": 4576, - "epoch": 0.027417286791051036, - "loss/policy_avg": -0.08705548942089081, - "lr": 9.81786809815951e-06, - "objective/entropy": 166.04830932617188, - "objective/kl": 24.707944869995117, - "objective/non_score_reward": -1.235397219657898, - "objective/rlhf_reward": -3.5415888786315914, - "objective/scores": 0.35, - "policy/approxkl_avg": 21.18993377685547, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.630859375, - "step": 285, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9989569187164307 - }, - { - "episode": 4592, - "epoch": 0.027513151430180585, - "loss/policy_avg": 0.03873559087514877, - "lr": 9.817229038854806e-06, - "objective/entropy": -7.895801544189453, - "objective/kl": 26.817386627197266, - "objective/non_score_reward": -1.340869426727295, - "objective/rlhf_reward": -3.9634773492813107, - "objective/scores": 0.35, - "policy/approxkl_avg": 94.96324920654297, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.689453125, - "step": 286, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0000836849212646 - }, - { - "episode": 4608, - "epoch": 0.027609016069310134, - "loss/policy_avg": 0.3947087824344635, - "lr": 9.816589979550103e-06, - "objective/entropy": 31.713714599609375, - "objective/kl": 35.37312698364258, - "objective/non_score_reward": -1.7686563730239868, - "objective/rlhf_reward": -7.074625730514526, - "objective/scores": 0.0, - "policy/approxkl_avg": 117.319091796875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.55078125, - "step": 287, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998248815536499 - }, - { - "episode": 4624, - "epoch": 0.027704880708439683, - "loss/policy_avg": -0.04924429580569267, - "lr": 9.8159509202454e-06, - "objective/entropy": 213.8193817138672, - "objective/kl": 33.728729248046875, - "objective/non_score_reward": -1.686436653137207, - "objective/rlhf_reward": -2.3457467317581173, - "objective/scores": 1.1, - "policy/approxkl_avg": 14.397890090942383, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8125, - "step": 288, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001190662384033 - }, - { - "episode": 4640, - "epoch": 0.027800745347569232, - "loss/policy_avg": 0.3702055513858795, - "lr": 9.815311860940695e-06, - "objective/entropy": 60.47701644897461, - "objective/kl": 28.081138610839844, - "objective/non_score_reward": -1.4040570259094238, - "objective/rlhf_reward": -5.616227865219116, - "objective/scores": 0.0, - "policy/approxkl_avg": 141.1427001953125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.52734375, - "step": 289, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983237981796265 - }, - { - "episode": 4656, - "epoch": 0.02789660998669878, - "loss/policy_avg": 0.6483702659606934, - "lr": 9.814672801635992e-06, - "objective/entropy": 56.31958770751953, - "objective/kl": 32.672027587890625, - "objective/non_score_reward": -1.633601427078247, - "objective/rlhf_reward": -8.534405708312988, - "objective/scores": -0.5, - "policy/approxkl_avg": 82.03401184082031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7890625, - "step": 290, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9977704286575317 - }, - { - "episode": 4672, - "epoch": 0.02799247462582833, - "loss/policy_avg": 0.2940463721752167, - "lr": 9.81403374233129e-06, - "objective/entropy": 116.48851013183594, - "objective/kl": 25.498628616333008, - "objective/non_score_reward": -1.2749314308166504, - "objective/rlhf_reward": -7.099725246429443, - "objective/scores": -0.5, - "policy/approxkl_avg": 66.54810333251953, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.80859375, - "step": 291, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9973530769348145 - }, - { - "episode": 4688, - "epoch": 0.02808833926495788, - "loss/policy_avg": 0.28353065252304077, - "lr": 9.813394683026586e-06, - "objective/entropy": -60.268192291259766, - "objective/kl": 31.477249145507812, - "objective/non_score_reward": -1.5738624334335327, - "objective/rlhf_reward": -6.295449614524841, - "objective/scores": 0.0, - "policy/approxkl_avg": 35.035682678222656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.888671875, - "step": 292, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0013890266418457 - }, - { - "episode": 4704, - "epoch": 0.028184203904087428, - "loss/policy_avg": 0.35568001866340637, - "lr": 9.812755623721883e-06, - "objective/entropy": 126.98811340332031, - "objective/kl": 31.8903865814209, - "objective/non_score_reward": -1.5945194959640503, - "objective/rlhf_reward": -8.37807846069336, - "objective/scores": -0.5, - "policy/approxkl_avg": 254.9582977294922, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4189453125, - "step": 293, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9979784488677979 - }, - { - "episode": 4720, - "epoch": 0.028280068543216977, - "loss/policy_avg": 0.2607693076133728, - "lr": 9.81211656441718e-06, - "objective/entropy": 25.59099578857422, - "objective/kl": 27.727397918701172, - "objective/non_score_reward": -1.3863699436187744, - "objective/rlhf_reward": -5.545479655265808, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.56276273727417, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.640625, - "step": 294, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9985105991363525 - }, - { - "episode": 4736, - "epoch": 0.028375933182346526, - "loss/policy_avg": 0.09589973092079163, - "lr": 9.811477505112475e-06, - "objective/entropy": 60.00609588623047, - "objective/kl": 28.59209442138672, - "objective/non_score_reward": -1.4296045303344727, - "objective/rlhf_reward": -4.40773727556169, - "objective/scores": 0.327670248696953, - "policy/approxkl_avg": 5.154585361480713, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.568359375, - "step": 295, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0015292167663574 - }, - { - "episode": 4752, - "epoch": 0.028471797821476075, - "loss/policy_avg": -0.23297792673110962, - "lr": 9.810838445807772e-06, - "objective/entropy": 172.1509552001953, - "objective/kl": 28.202377319335938, - "objective/non_score_reward": -1.4101189374923706, - "objective/rlhf_reward": -5.640475809574127, - "objective/scores": 0.0, - "policy/approxkl_avg": 10.175811767578125, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.619140625, - "step": 296, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0159595012664795 - }, - { - "episode": 4768, - "epoch": 0.028567662460605624, - "loss/policy_avg": 0.24692611396312714, - "lr": 9.810199386503069e-06, - "objective/entropy": 82.19457244873047, - "objective/kl": 23.421173095703125, - "objective/non_score_reward": -1.1710586547851562, - "objective/rlhf_reward": -0.2842347383499142, - "objective/scores": 1.1, - "policy/approxkl_avg": 28.198326110839844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.765625, - "step": 297, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000041961669922 - }, - { - "episode": 4784, - "epoch": 0.028663527099735173, - "loss/policy_avg": 0.2046826034784317, - "lr": 9.809560327198366e-06, - "objective/entropy": 98.7589111328125, - "objective/kl": 31.546274185180664, - "objective/non_score_reward": -1.5773136615753174, - "objective/rlhf_reward": -8.309255599975586, - "objective/scores": -0.5, - "policy/approxkl_avg": 203.54513549804688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.697265625, - "step": 298, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9957282543182373 - }, - { - "episode": 4800, - "epoch": 0.02875939173886472, - "loss/policy_avg": 0.24512505531311035, - "lr": 9.808921267893663e-06, - "objective/entropy": 206.71981811523438, - "objective/kl": 26.81441307067871, - "objective/non_score_reward": -1.3407206535339355, - "objective/rlhf_reward": -5.362882316112518, - "objective/scores": 0.0, - "policy/approxkl_avg": 163.9492950439453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.796875, - "step": 299, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.996509313583374 - }, - { - "episode": 4816, - "epoch": 0.02885525637799427, - "loss/policy_avg": 0.028275877237319946, - "lr": 9.808282208588958e-06, - "objective/entropy": -18.974689483642578, - "objective/kl": 37.04328536987305, - "objective/non_score_reward": -1.852164387702942, - "objective/rlhf_reward": -9.40865707397461, - "objective/scores": -0.5, - "policy/approxkl_avg": 33.29435348510742, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.666015625, - "step": 300, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9966996908187866 - }, - { - "episode": 4832, - "epoch": 0.02895112101712382, - "loss/policy_avg": 0.1949348747730255, - "lr": 9.807643149284255e-06, - "objective/entropy": 83.58306121826172, - "objective/kl": 37.85429000854492, - "objective/non_score_reward": -1.8927146196365356, - "objective/rlhf_reward": -9.570858001708984, - "objective/scores": -0.5, - "policy/approxkl_avg": 69.484619140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.611328125, - "step": 301, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9983818531036377 - }, - { - "episode": 4848, - "epoch": 0.029046985656253372, - "loss/policy_avg": -0.1876036524772644, - "lr": 9.80700408997955e-06, - "objective/entropy": -33.166542053222656, - "objective/kl": 29.514928817749023, - "objective/non_score_reward": -1.4757463932037354, - "objective/rlhf_reward": -5.902985334396362, - "objective/scores": 0.0, - "policy/approxkl_avg": 21.677946090698242, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.58984375, - "step": 302, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998695731163025 - }, - { - "episode": 4864, - "epoch": 0.02914285029538292, - "loss/policy_avg": 0.022164881229400635, - "lr": 9.806365030674847e-06, - "objective/entropy": 94.1938247680664, - "objective/kl": 34.645530700683594, - "objective/non_score_reward": -1.7322763204574585, - "objective/rlhf_reward": -8.929105758666992, - "objective/scores": -0.5, - "policy/approxkl_avg": 211.6139373779297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.779296875, - "step": 303, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997363805770874 - }, - { - "episode": 4880, - "epoch": 0.02923871493451247, - "loss/policy_avg": 0.7907856106758118, - "lr": 9.805725971370144e-06, - "objective/entropy": 63.40114212036133, - "objective/kl": 32.65860366821289, - "objective/non_score_reward": -1.6329302787780762, - "objective/rlhf_reward": -6.531721353530884, - "objective/scores": 0.0, - "policy/approxkl_avg": 12.615059852600098, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8515625, - "step": 304, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0000805854797363 - }, - { - "episode": 4896, - "epoch": 0.02933457957364202, - "loss/policy_avg": 0.07148364931344986, - "lr": 9.80508691206544e-06, - "objective/entropy": 88.27102661132812, - "objective/kl": 39.98832702636719, - "objective/non_score_reward": -1.9994162321090698, - "objective/rlhf_reward": -5.073946271778318, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 129.564208984375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.533203125, - "step": 305, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998765468597412 - }, - { - "episode": 4912, - "epoch": 0.029430444212771568, - "loss/policy_avg": 0.17122961580753326, - "lr": 9.804447852760737e-06, - "objective/entropy": -49.38551330566406, - "objective/kl": 24.57117462158203, - "objective/non_score_reward": -1.2285586595535278, - "objective/rlhf_reward": -6.914234638214111, - "objective/scores": -0.5, - "policy/approxkl_avg": 7.128786087036133, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.755859375, - "step": 306, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9974303245544434 - }, - { - "episode": 4928, - "epoch": 0.029526308851901117, - "loss/policy_avg": 0.2127828598022461, - "lr": 9.803808793456034e-06, - "objective/entropy": 75.18860626220703, - "objective/kl": 26.683685302734375, - "objective/non_score_reward": -1.3341842889785767, - "objective/rlhf_reward": -5.336737275123596, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.522137641906738, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6875, - "step": 307, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001485824584961 - }, - { - "episode": 4944, - "epoch": 0.029622173491030666, - "loss/policy_avg": -0.11516772210597992, - "lr": 9.80316973415133e-06, - "objective/entropy": -0.04103279113769531, - "objective/kl": 38.64908218383789, - "objective/non_score_reward": -1.932453989982605, - "objective/rlhf_reward": -6.067956691206085, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 4.6061835289001465, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.724609375, - "step": 308, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000288248062134 - }, - { - "episode": 4960, - "epoch": 0.029718038130160215, - "loss/policy_avg": 0.2119406759738922, - "lr": 9.802530674846626e-06, - "objective/entropy": 145.13497924804688, - "objective/kl": 37.7593879699707, - "objective/non_score_reward": -1.8879692554473877, - "objective/rlhf_reward": -9.55187702178955, - "objective/scores": -0.5, - "policy/approxkl_avg": 189.22616577148438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.447265625, - "step": 309, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9981553554534912 - }, - { - "episode": 4976, - "epoch": 0.029813902769289764, - "loss/policy_avg": 0.3452683091163635, - "lr": 9.801891615541923e-06, - "objective/entropy": 311.71026611328125, - "objective/kl": 26.775503158569336, - "objective/non_score_reward": -1.3387750387191772, - "objective/rlhf_reward": -5.3551002740859985, - "objective/scores": 0.0, - "policy/approxkl_avg": 134.57427978515625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.978515625, - "step": 310, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.99644136428833 - }, - { - "episode": 4992, - "epoch": 0.029909767408419313, - "loss/policy_avg": -0.21734583377838135, - "lr": 9.80125255623722e-06, - "objective/entropy": -49.33317184448242, - "objective/kl": 24.155736923217773, - "objective/non_score_reward": -1.2077867984771729, - "objective/rlhf_reward": -4.8311474323272705, - "objective/scores": 0.0, - "policy/approxkl_avg": 31.801761627197266, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.59765625, - "step": 311, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999210834503174 - }, - { - "episode": 5008, - "epoch": 0.03000563204754886, - "loss/policy_avg": -0.4189864993095398, - "lr": 9.800613496932517e-06, - "objective/entropy": 176.72503662109375, - "objective/kl": 36.288841247558594, - "objective/non_score_reward": -1.8144421577453613, - "objective/rlhf_reward": -5.833936293323603, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 94.41351318359375, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.552734375, - "step": 312, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.006214141845703 - }, - { - "episode": 5024, - "epoch": 0.03010149668667841, - "loss/policy_avg": 0.5682752132415771, - "lr": 9.799974437627812e-06, - "objective/entropy": 71.7433090209961, - "objective/kl": 31.988750457763672, - "objective/non_score_reward": -1.5994374752044678, - "objective/rlhf_reward": -8.397749900817871, - "objective/scores": -0.5, - "policy/approxkl_avg": 33.52442932128906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.662109375, - "step": 313, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9989948272705078 - }, - { - "episode": 5040, - "epoch": 0.03019736132580796, - "loss/policy_avg": 0.5441787242889404, - "lr": 9.799335378323109e-06, - "objective/entropy": 214.158447265625, - "objective/kl": 31.045618057250977, - "objective/non_score_reward": -1.5522809028625488, - "objective/rlhf_reward": -1.8091239094734188, - "objective/scores": 1.1, - "policy/approxkl_avg": 58.665191650390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.876953125, - "step": 314, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988675117492676 - }, - { - "episode": 5056, - "epoch": 0.03029322596493751, - "loss/policy_avg": 0.5254380702972412, - "lr": 9.798696319018406e-06, - "objective/entropy": -19.816749572753906, - "objective/kl": 39.83454895019531, - "objective/non_score_reward": -1.991727590560913, - "objective/rlhf_reward": -5.043191228748533, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 79.64820861816406, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6328125, - "step": 315, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973756074905396 - }, - { - "episode": 5072, - "epoch": 0.030389090604067057, - "loss/policy_avg": 0.2726515531539917, - "lr": 9.798057259713703e-06, - "objective/entropy": -4.6614837646484375, - "objective/kl": 35.06428527832031, - "objective/non_score_reward": -1.7532143592834473, - "objective/rlhf_reward": -9.012857437133789, - "objective/scores": -0.5, - "policy/approxkl_avg": 54.6142578125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5234375, - "step": 316, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981968402862549 - }, - { - "episode": 5088, - "epoch": 0.030484955243196606, - "loss/policy_avg": 0.26141488552093506, - "lr": 9.797418200409e-06, - "objective/entropy": 47.011985778808594, - "objective/kl": 45.61205291748047, - "objective/non_score_reward": -2.28060245513916, - "objective/rlhf_reward": -11.12240982055664, - "objective/scores": -0.5, - "policy/approxkl_avg": 180.7010498046875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.634765625, - "step": 317, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997007131576538 - }, - { - "episode": 5104, - "epoch": 0.030580819882326155, - "loss/policy_avg": 0.4122789800167084, - "lr": 9.796779141104296e-06, - "objective/entropy": 94.4898452758789, - "objective/kl": 28.151004791259766, - "objective/non_score_reward": -1.4075502157211304, - "objective/rlhf_reward": -7.6302008628845215, - "objective/scores": -0.5, - "policy/approxkl_avg": 103.5096664428711, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.615234375, - "step": 318, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992344379425049 - }, - { - "episode": 5120, - "epoch": 0.030676684521455704, - "loss/policy_avg": 0.0029612816870212555, - "lr": 9.796140081799592e-06, - "objective/entropy": 198.75570678710938, - "objective/kl": 25.085525512695312, - "objective/non_score_reward": -1.2542762756347656, - "objective/rlhf_reward": -5.017104983329773, - "objective/scores": 0.0, - "policy/approxkl_avg": 120.64035034179688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.84375, - "step": 319, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988009929656982 - }, - { - "episode": 5136, - "epoch": 0.030772549160585253, - "loss/policy_avg": 0.3600286841392517, - "lr": 9.795501022494888e-06, - "objective/entropy": 42.533729553222656, - "objective/kl": 28.457992553710938, - "objective/non_score_reward": -1.422899603843689, - "objective/rlhf_reward": -5.691598415374756, - "objective/scores": 0.0, - "policy/approxkl_avg": 73.802490234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5234375, - "step": 320, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9940855503082275 - }, - { - "episode": 5152, - "epoch": 0.030868413799714802, - "loss/policy_avg": -0.13510574400424957, - "lr": 9.794861963190185e-06, - "objective/entropy": -5.396385192871094, - "objective/kl": 22.229045867919922, - "objective/non_score_reward": -1.111452341079712, - "objective/rlhf_reward": -2.9952114923909754, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 45.65439987182617, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 321, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999731183052063 - }, - { - "episode": 5168, - "epoch": 0.03096427843884435, - "loss/policy_avg": -0.15009954571723938, - "lr": 9.794222903885482e-06, - "objective/entropy": -7.345497131347656, - "objective/kl": 32.8961181640625, - "objective/non_score_reward": -1.644805908203125, - "objective/rlhf_reward": -8.5792236328125, - "objective/scores": -0.5, - "policy/approxkl_avg": 14.027180671691895, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.525390625, - "step": 322, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.002540111541748 - }, - { - "episode": 5184, - "epoch": 0.0310601430779739, - "loss/policy_avg": -0.0007271356880664825, - "lr": 9.793583844580777e-06, - "objective/entropy": 163.2379608154297, - "objective/kl": 29.110525131225586, - "objective/non_score_reward": -1.455526351928711, - "objective/rlhf_reward": -5.822105169296265, - "objective/scores": 0.0, - "policy/approxkl_avg": 22.514135360717773, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4892578125, - "step": 323, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.010396718978882 - }, - { - "episode": 5200, - "epoch": 0.03115600771710345, - "loss/policy_avg": 0.19244720041751862, - "lr": 9.792944785276074e-06, - "objective/entropy": -23.184280395507812, - "objective/kl": 28.509687423706055, - "objective/non_score_reward": -1.4254844188690186, - "objective/rlhf_reward": -1.3019374370574948, - "objective/scores": 1.1, - "policy/approxkl_avg": 88.21593475341797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.734375, - "step": 324, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985356330871582 - }, - { - "episode": 5216, - "epoch": 0.031251872356233, - "loss/policy_avg": 0.12977905571460724, - "lr": 9.792305725971371e-06, - "objective/entropy": 150.05477905273438, - "objective/kl": 31.353958129882812, - "objective/non_score_reward": -1.5676978826522827, - "objective/rlhf_reward": -6.270791292190552, - "objective/scores": 0.0, - "policy/approxkl_avg": 17.6948184967041, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.474609375, - "step": 325, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000549077987671 - }, - { - "episode": 5232, - "epoch": 0.03134773699536255, - "loss/policy_avg": 1.3926464319229126, - "lr": 9.791666666666666e-06, - "objective/entropy": 145.20407104492188, - "objective/kl": 43.132911682128906, - "objective/non_score_reward": -2.1566455364227295, - "objective/rlhf_reward": -10.626582145690918, - "objective/scores": -0.5, - "policy/approxkl_avg": 143.59188842773438, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.703125, - "step": 326, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001438617706299 - }, - { - "episode": 5248, - "epoch": 0.0314436016344921, - "loss/policy_avg": -0.06288231909275055, - "lr": 9.791027607361963e-06, - "objective/entropy": -0.5038261413574219, - "objective/kl": 29.548839569091797, - "objective/non_score_reward": -1.4774420261383057, - "objective/rlhf_reward": -4.584255132704897, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 33.57347869873047, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4208984375, - "step": 327, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0011537075042725 - }, - { - "episode": 5264, - "epoch": 0.03153946627362165, - "loss/policy_avg": 0.04387858510017395, - "lr": 9.79038854805726e-06, - "objective/entropy": 33.670188903808594, - "objective/kl": 19.800825119018555, - "objective/non_score_reward": -0.9900413751602173, - "objective/rlhf_reward": -5.960165500640869, - "objective/scores": -0.5, - "policy/approxkl_avg": 31.26099967956543, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.640625, - "step": 328, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9964679479599 - }, - { - "episode": 5280, - "epoch": 0.0316353309127512, - "loss/policy_avg": 0.08830268681049347, - "lr": 9.789749488752557e-06, - "objective/entropy": 77.01775360107422, - "objective/kl": 33.558563232421875, - "objective/non_score_reward": -1.6779282093048096, - "objective/rlhf_reward": -8.711712837219238, - "objective/scores": -0.5, - "policy/approxkl_avg": 3.013957977294922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6796875, - "step": 329, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998348593711853 - }, - { - "episode": 5296, - "epoch": 0.031731195551880746, - "loss/policy_avg": 0.39634984731674194, - "lr": 9.789110429447854e-06, - "objective/entropy": -55.59328079223633, - "objective/kl": 27.393354415893555, - "objective/non_score_reward": -1.3696677684783936, - "objective/rlhf_reward": -5.47867077589035, - "objective/scores": 0.0, - "policy/approxkl_avg": 17.402725219726562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.525390625, - "step": 330, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996036291122437 - }, - { - "episode": 5312, - "epoch": 0.031827060191010295, - "loss/policy_avg": 0.28659260272979736, - "lr": 9.78847137014315e-06, - "objective/entropy": -92.15465545654297, - "objective/kl": 37.35984802246094, - "objective/non_score_reward": -1.8679924011230469, - "objective/rlhf_reward": -7.471969723701477, - "objective/scores": 0.0, - "policy/approxkl_avg": 128.93292236328125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 331, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999650239944458 - }, - { - "episode": 5328, - "epoch": 0.031922924830139844, - "loss/policy_avg": 0.04176933690905571, - "lr": 9.787832310838446e-06, - "objective/entropy": 134.93870544433594, - "objective/kl": 36.91297149658203, - "objective/non_score_reward": -1.8456485271453857, - "objective/rlhf_reward": -9.38259506225586, - "objective/scores": -0.5, - "policy/approxkl_avg": 10.31844711303711, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.806640625, - "step": 332, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9973440170288086 - }, - { - "episode": 5344, - "epoch": 0.03201878946926939, - "loss/policy_avg": 0.12372894585132599, - "lr": 9.787193251533743e-06, - "objective/entropy": 71.03854370117188, - "objective/kl": 31.68985366821289, - "objective/non_score_reward": -1.5844929218292236, - "objective/rlhf_reward": -8.337971687316895, - "objective/scores": -0.5, - "policy/approxkl_avg": 47.733943939208984, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.783203125, - "step": 333, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001028060913086 - }, - { - "episode": 5360, - "epoch": 0.03211465410839894, - "loss/policy_avg": 0.2358803153038025, - "lr": 9.78655419222904e-06, - "objective/entropy": 115.29727172851562, - "objective/kl": 28.339174270629883, - "objective/non_score_reward": -1.4169588088989258, - "objective/rlhf_reward": -7.667835235595703, - "objective/scores": -0.5, - "policy/approxkl_avg": 44.00404357910156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.80859375, - "step": 334, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.99520742893219 - }, - { - "episode": 5376, - "epoch": 0.03221051874752849, - "loss/policy_avg": 0.023874476552009583, - "lr": 9.785915132924337e-06, - "objective/entropy": -10.510528564453125, - "objective/kl": 35.899017333984375, - "objective/non_score_reward": -1.7949509620666504, - "objective/rlhf_reward": -9.179803848266602, - "objective/scores": -0.5, - "policy/approxkl_avg": 134.71612548828125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.642578125, - "step": 335, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9987369775772095 - }, - { - "episode": 5392, - "epoch": 0.03230638338665804, - "loss/policy_avg": 0.2157980352640152, - "lr": 9.785276073619633e-06, - "objective/entropy": -117.77426147460938, - "objective/kl": 36.19778060913086, - "objective/non_score_reward": -1.8098891973495483, - "objective/rlhf_reward": -7.239556908607483, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.148270606994629, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.66015625, - "step": 336, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9979586601257324 - }, - { - "episode": 5408, - "epoch": 0.03240224802578759, - "loss/policy_avg": 0.5062718987464905, - "lr": 9.784637014314929e-06, - "objective/entropy": 2.911235809326172, - "objective/kl": 29.895044326782227, - "objective/non_score_reward": -1.494752287864685, - "objective/rlhf_reward": -4.668328216275572, - "objective/scores": 0.327670248696953, - "policy/approxkl_avg": 57.235164642333984, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.58984375, - "step": 337, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989079236984253 - }, - { - "episode": 5424, - "epoch": 0.03249811266491714, - "loss/policy_avg": 0.35139840841293335, - "lr": 9.783997955010226e-06, - "objective/entropy": 17.953224182128906, - "objective/kl": 31.853174209594727, - "objective/non_score_reward": -1.5926587581634521, - "objective/rlhf_reward": -4.766515169207173, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 21.75585174560547, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.787109375, - "step": 338, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9962561130523682 - }, - { - "episode": 5440, - "epoch": 0.03259397730404669, - "loss/policy_avg": 0.5215449333190918, - "lr": 9.783358895705522e-06, - "objective/entropy": -65.21415710449219, - "objective/kl": 41.145973205566406, - "objective/non_score_reward": -2.0572988986968994, - "objective/rlhf_reward": -10.229194641113281, - "objective/scores": -0.5, - "policy/approxkl_avg": 80.21241760253906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.75, - "step": 339, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977318048477173 - }, - { - "episode": 5456, - "epoch": 0.032689841943176236, - "loss/policy_avg": 0.28369078040122986, - "lr": 9.78271983640082e-06, - "objective/entropy": 94.98407745361328, - "objective/kl": 35.06120300292969, - "objective/non_score_reward": -1.7530601024627686, - "objective/rlhf_reward": -9.01224136352539, - "objective/scores": -0.5, - "policy/approxkl_avg": 24.02906036376953, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.587890625, - "step": 340, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989793300628662 - }, - { - "episode": 5472, - "epoch": 0.032785706582305785, - "loss/policy_avg": 1.9119699001312256, - "lr": 9.782080777096116e-06, - "objective/entropy": 103.6617202758789, - "objective/kl": 43.41075134277344, - "objective/non_score_reward": -2.1705374717712402, - "objective/rlhf_reward": -8.68215036392212, - "objective/scores": 0.0, - "policy/approxkl_avg": 27.09801483154297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.548828125, - "step": 341, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000924825668335 - }, - { - "episode": 5488, - "epoch": 0.032881571221435334, - "loss/policy_avg": 0.5120334625244141, - "lr": 9.781441717791413e-06, - "objective/entropy": 167.12762451171875, - "objective/kl": 35.880027770996094, - "objective/non_score_reward": -1.7940013408660889, - "objective/rlhf_reward": -9.176005363464355, - "objective/scores": -0.5, - "policy/approxkl_avg": 70.14405822753906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.630859375, - "step": 342, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0025668144226074 - }, - { - "episode": 5504, - "epoch": 0.03297743586056488, - "loss/policy_avg": 0.015120374038815498, - "lr": 9.780802658486708e-06, - "objective/entropy": 205.39503479003906, - "objective/kl": 41.50642395019531, - "objective/non_score_reward": -2.0753211975097656, - "objective/rlhf_reward": -8.301284790039062, - "objective/scores": 0.0, - "policy/approxkl_avg": 1.8751791715621948, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7890625, - "step": 343, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0046072006225586 - }, - { - "episode": 5520, - "epoch": 0.03307330049969443, - "loss/policy_avg": 1.035445213317871, - "lr": 9.780163599182005e-06, - "objective/entropy": 91.37589263916016, - "objective/kl": 35.44560241699219, - "objective/non_score_reward": -1.7722800970077515, - "objective/rlhf_reward": -5.638522367091522, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 18.435420989990234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65234375, - "step": 344, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999558925628662 - }, - { - "episode": 5536, - "epoch": 0.03316916513882398, - "loss/policy_avg": 0.07220157980918884, - "lr": 9.7795245398773e-06, - "objective/entropy": 109.74974822998047, - "objective/kl": 30.194711685180664, - "objective/non_score_reward": -1.5097355842590332, - "objective/rlhf_reward": -4.660340347377163, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 8.997635841369629, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.498046875, - "step": 345, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9951503276824951 - }, - { - "episode": 5552, - "epoch": 0.03326502977795353, - "loss/policy_avg": -0.0515998937189579, - "lr": 9.778885480572597e-06, - "objective/entropy": 139.16102600097656, - "objective/kl": 23.493303298950195, - "objective/non_score_reward": -1.1746652126312256, - "objective/rlhf_reward": -4.698660850524902, - "objective/scores": 0.0, - "policy/approxkl_avg": 26.937942504882812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.654296875, - "step": 346, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000459909439087 - }, - { - "episode": 5568, - "epoch": 0.03336089441708308, - "loss/policy_avg": 0.5854986906051636, - "lr": 9.778246421267894e-06, - "objective/entropy": -27.624618530273438, - "objective/kl": 25.353118896484375, - "objective/non_score_reward": -1.2676559686660767, - "objective/rlhf_reward": -3.466503713194447, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 65.4859619140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.57421875, - "step": 347, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9998695850372314 - }, - { - "episode": 5584, - "epoch": 0.03345675905621263, - "loss/policy_avg": 0.13264235854148865, - "lr": 9.777607361963191e-06, - "objective/entropy": 180.84451293945312, - "objective/kl": 29.817018508911133, - "objective/non_score_reward": -1.4908509254455566, - "objective/rlhf_reward": -4.60415395471899, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 229.28366088867188, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.74609375, - "step": 348, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9972574710845947 - }, - { - "episode": 5600, - "epoch": 0.033552623695342176, - "loss/policy_avg": 0.0762765109539032, - "lr": 9.776968302658488e-06, - "objective/entropy": 60.79357147216797, - "objective/kl": 42.37275314331055, - "objective/non_score_reward": -2.1186375617980957, - "objective/rlhf_reward": -10.474550247192383, - "objective/scores": -0.5, - "policy/approxkl_avg": 1.9780604839324951, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.595703125, - "step": 349, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0007505416870117 - }, - { - "episode": 5616, - "epoch": 0.033648488334471725, - "loss/policy_avg": 0.056807953864336014, - "lr": 9.776329243353783e-06, - "objective/entropy": 52.78253173828125, - "objective/kl": 34.426055908203125, - "objective/non_score_reward": -1.7213029861450195, - "objective/rlhf_reward": -5.404259326870799, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 43.55577087402344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.578125, - "step": 350, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9966363906860352 - }, - { - "episode": 5632, - "epoch": 0.033744352973601274, - "loss/policy_avg": -0.12831875681877136, - "lr": 9.77569018404908e-06, - "objective/entropy": 16.329566955566406, - "objective/kl": 32.15143585205078, - "objective/non_score_reward": -1.6075717210769653, - "objective/rlhf_reward": -8.430286407470703, - "objective/scores": -0.5, - "policy/approxkl_avg": 75.677978515625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.50390625, - "step": 351, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998267412185669 - }, - { - "episode": 5648, - "epoch": 0.03384021761273082, - "loss/policy_avg": 1.4532546997070312, - "lr": 9.775051124744377e-06, - "objective/entropy": 74.94012451171875, - "objective/kl": 27.538991928100586, - "objective/non_score_reward": -1.3769495487213135, - "objective/rlhf_reward": -7.507798194885254, - "objective/scores": -0.5, - "policy/approxkl_avg": 37.15715026855469, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.75, - "step": 352, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0019326210021973 - }, - { - "episode": 5664, - "epoch": 0.03393608225186037, - "loss/policy_avg": 0.18699043989181519, - "lr": 9.774412065439674e-06, - "objective/entropy": 154.23028564453125, - "objective/kl": 26.423828125, - "objective/non_score_reward": -1.3211913108825684, - "objective/rlhf_reward": -7.284765243530273, - "objective/scores": -0.5, - "policy/approxkl_avg": 43.985931396484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4609375, - "step": 353, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.997018575668335 - }, - { - "episode": 5680, - "epoch": 0.03403194689098992, - "loss/policy_avg": 0.4292873740196228, - "lr": 9.77377300613497e-06, - "objective/entropy": 69.04763793945312, - "objective/kl": 29.13799285888672, - "objective/non_score_reward": -1.456899642944336, - "objective/rlhf_reward": -7.8275980949401855, - "objective/scores": -0.5, - "policy/approxkl_avg": 24.5076904296875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.681640625, - "step": 354, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984169006347656 - }, - { - "episode": 5696, - "epoch": 0.03412781153011947, - "loss/policy_avg": 0.3388688564300537, - "lr": 9.773133946830267e-06, - "objective/entropy": 138.8565673828125, - "objective/kl": 36.0447998046875, - "objective/non_score_reward": -1.802240014076233, - "objective/rlhf_reward": -7.208960175514221, - "objective/scores": 0.0, - "policy/approxkl_avg": 74.22533416748047, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.79296875, - "step": 355, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992218017578125 - }, - { - "episode": 5712, - "epoch": 0.03422367616924902, - "loss/policy_avg": 0.12512633204460144, - "lr": 9.772494887525563e-06, - "objective/entropy": 74.25532531738281, - "objective/kl": 22.747737884521484, - "objective/non_score_reward": -1.1373867988586426, - "objective/rlhf_reward": -6.54954719543457, - "objective/scores": -0.5, - "policy/approxkl_avg": 3.5538787841796875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 356, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998750925064087 - }, - { - "episode": 5728, - "epoch": 0.03431954080837857, - "loss/policy_avg": 0.19646649062633514, - "lr": 9.77185582822086e-06, - "objective/entropy": 91.78384399414062, - "objective/kl": 33.680171966552734, - "objective/non_score_reward": -1.6840085983276367, - "objective/rlhf_reward": -5.357431986419064, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 234.2763214111328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.521484375, - "step": 357, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.99714994430542 - }, - { - "episode": 5744, - "epoch": 0.03441540544750812, - "loss/policy_avg": 0.1719483882188797, - "lr": 9.771216768916156e-06, - "objective/entropy": 15.523513793945312, - "objective/kl": 30.582561492919922, - "objective/non_score_reward": -1.529128074645996, - "objective/rlhf_reward": -6.116512298583984, - "objective/scores": 0.0, - "policy/approxkl_avg": 51.08443832397461, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.408203125, - "step": 358, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0013043880462646 - }, - { - "episode": 5760, - "epoch": 0.034511270086637666, - "loss/policy_avg": 0.24180445075035095, - "lr": 9.770577709611453e-06, - "objective/entropy": 118.51576232910156, - "objective/kl": 42.176387786865234, - "objective/non_score_reward": -2.1088194847106934, - "objective/rlhf_reward": -8.435277700424194, - "objective/scores": 0.0, - "policy/approxkl_avg": 118.47047424316406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.583984375, - "step": 359, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.995426893234253 - }, - { - "episode": 5776, - "epoch": 0.034607134725767215, - "loss/policy_avg": 0.43327879905700684, - "lr": 9.76993865030675e-06, - "objective/entropy": 136.4874267578125, - "objective/kl": 45.31035232543945, - "objective/non_score_reward": -2.2655177116394043, - "objective/rlhf_reward": -9.062070846557617, - "objective/scores": 0.0, - "policy/approxkl_avg": 17.986034393310547, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.396484375, - "step": 360, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9967424869537354 - }, - { - "episode": 5792, - "epoch": 0.034702999364896764, - "loss/policy_avg": 0.19567860662937164, - "lr": 9.769299591002045e-06, - "objective/entropy": 42.139869689941406, - "objective/kl": 34.16746520996094, - "objective/non_score_reward": -1.7083733081817627, - "objective/rlhf_reward": -6.83349347114563, - "objective/scores": 0.0, - "policy/approxkl_avg": 63.83492660522461, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6875, - "step": 361, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9962726831436157 - }, - { - "episode": 5808, - "epoch": 0.03479886400402631, - "loss/policy_avg": 0.22219505906105042, - "lr": 9.768660531697342e-06, - "objective/entropy": 58.672523498535156, - "objective/kl": 36.7076416015625, - "objective/non_score_reward": -1.8353819847106934, - "objective/rlhf_reward": -5.825756037028965, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 122.09330749511719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4423828125, - "step": 362, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.996779441833496 - }, - { - "episode": 5824, - "epoch": 0.03489472864315586, - "loss/policy_avg": 0.15133829414844513, - "lr": 9.768021472392639e-06, - "objective/entropy": 178.90162658691406, - "objective/kl": 45.38115692138672, - "objective/non_score_reward": -2.2690577507019043, - "objective/rlhf_reward": -4.676230645179748, - "objective/scores": 1.1, - "policy/approxkl_avg": 104.61646270751953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.529296875, - "step": 363, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996953010559082 - }, - { - "episode": 5840, - "epoch": 0.03499059328228541, - "loss/policy_avg": -0.021383460611104965, - "lr": 9.767382413087936e-06, - "objective/entropy": -69.16898345947266, - "objective/kl": 22.05933380126953, - "objective/non_score_reward": -1.1029666662216187, - "objective/rlhf_reward": -2.289160372988258, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 29.110021591186523, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70703125, - "step": 364, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00262188911438 - }, - { - "episode": 5856, - "epoch": 0.03508645792141496, - "loss/policy_avg": -0.04033146798610687, - "lr": 9.766743353783233e-06, - "objective/entropy": 141.72869873046875, - "objective/kl": 44.84279251098633, - "objective/non_score_reward": -2.2421395778656006, - "objective/rlhf_reward": -10.968558311462402, - "objective/scores": -0.5, - "policy/approxkl_avg": 186.49794006347656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.74609375, - "step": 365, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999026894569397 - }, - { - "episode": 5872, - "epoch": 0.03518232256054451, - "loss/policy_avg": 0.4079732298851013, - "lr": 9.76610429447853e-06, - "objective/entropy": 111.8157958984375, - "objective/kl": 36.629844665527344, - "objective/non_score_reward": -1.8314921855926514, - "objective/rlhf_reward": -9.325968742370605, - "objective/scores": -0.5, - "policy/approxkl_avg": 5.762906551361084, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.751953125, - "step": 366, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0008926391601562 - }, - { - "episode": 5888, - "epoch": 0.03527818719967406, - "loss/policy_avg": 0.04947970062494278, - "lr": 9.765465235173825e-06, - "objective/entropy": 191.2625274658203, - "objective/kl": 34.450801849365234, - "objective/non_score_reward": -1.722540259361267, - "objective/rlhf_reward": -6.890160799026489, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.018008232116699, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.537109375, - "step": 367, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.001461982727051 - }, - { - "episode": 5904, - "epoch": 0.03537405183880361, - "loss/policy_avg": 0.0687071904540062, - "lr": 9.764826175869122e-06, - "objective/entropy": 83.29997253417969, - "objective/kl": 22.66602325439453, - "objective/non_score_reward": -1.1333011388778687, - "objective/rlhf_reward": -4.533204555511475, - "objective/scores": 0.0, - "policy/approxkl_avg": 1.5386806726455688, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.353515625, - "step": 368, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999902248382568 - }, - { - "episode": 5920, - "epoch": 0.035469916477933155, - "loss/policy_avg": 0.2985497713088989, - "lr": 9.764187116564417e-06, - "objective/entropy": 139.81521606445312, - "objective/kl": 24.542152404785156, - "objective/non_score_reward": -1.2271076440811157, - "objective/rlhf_reward": -4.908430516719818, - "objective/scores": 0.0, - "policy/approxkl_avg": 34.6478271484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.41796875, - "step": 369, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0018105506896973 - }, - { - "episode": 5936, - "epoch": 0.035565781117062704, - "loss/policy_avg": 0.08180458098649979, - "lr": 9.763548057259714e-06, - "objective/entropy": -5.604040145874023, - "objective/kl": 37.0791015625, - "objective/non_score_reward": -1.8539552688598633, - "objective/rlhf_reward": -5.859561531749323, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 2.4335670471191406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.580078125, - "step": 370, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0005955696105957 - }, - { - "episode": 5952, - "epoch": 0.03566164575619225, - "loss/policy_avg": 0.506417453289032, - "lr": 9.76290899795501e-06, - "objective/entropy": 35.1672248840332, - "objective/kl": 32.360355377197266, - "objective/non_score_reward": -1.618017554283142, - "objective/rlhf_reward": -5.161389222344756, - "objective/scores": 0.327670248696953, - "policy/approxkl_avg": 189.36500549316406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.615234375, - "step": 371, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9977048635482788 - }, - { - "episode": 5968, - "epoch": 0.0357575103953218, - "loss/policy_avg": 0.265750527381897, - "lr": 9.762269938650308e-06, - "objective/entropy": 194.55935668945312, - "objective/kl": 37.323184967041016, - "objective/non_score_reward": -1.866159200668335, - "objective/rlhf_reward": -3.064636981487274, - "objective/scores": 1.1, - "policy/approxkl_avg": 69.688232421875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 372, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9976245164871216 - }, - { - "episode": 5984, - "epoch": 0.03585337503445135, - "loss/policy_avg": 0.7971226572990417, - "lr": 9.761630879345604e-06, - "objective/entropy": 8.025199890136719, - "objective/kl": 27.826885223388672, - "objective/non_score_reward": -1.3913441896438599, - "objective/rlhf_reward": -5.5653769969940186, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.6349968910217285, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4296875, - "step": 373, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0026164054870605 - }, - { - "episode": 6000, - "epoch": 0.03594923967358091, - "loss/policy_avg": 0.40386438369750977, - "lr": 9.7609918200409e-06, - "objective/entropy": -74.85362243652344, - "objective/kl": 34.3883056640625, - "objective/non_score_reward": -1.7194151878356934, - "objective/rlhf_reward": -6.8776609897613525, - "objective/scores": 0.0, - "policy/approxkl_avg": 34.913692474365234, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.646484375, - "step": 374, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9994592666625977 - }, - { - "episode": 6016, - "epoch": 0.036045104312710456, - "loss/policy_avg": 0.13621872663497925, - "lr": 9.760352760736196e-06, - "objective/entropy": 66.13349914550781, - "objective/kl": 35.28841781616211, - "objective/non_score_reward": -1.764420986175537, - "objective/rlhf_reward": -5.324350611368815, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 29.487491607666016, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4521484375, - "step": 375, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0004875659942627 - }, - { - "episode": 6032, - "epoch": 0.036140968951840005, - "loss/policy_avg": 1.1029133796691895, - "lr": 9.759713701431493e-06, - "objective/entropy": 152.87005615234375, - "objective/kl": 33.37676239013672, - "objective/non_score_reward": -1.6688382625579834, - "objective/rlhf_reward": -4.9420197168986, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 139.82919311523438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.80078125, - "step": 376, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985812902450562 - }, - { - "episode": 6048, - "epoch": 0.036236833590969554, - "loss/policy_avg": 0.11807486414909363, - "lr": 9.75907464212679e-06, - "objective/entropy": 217.25425720214844, - "objective/kl": 36.396339416503906, - "objective/non_score_reward": -1.819817066192627, - "objective/rlhf_reward": -9.279268264770508, - "objective/scores": -0.5, - "policy/approxkl_avg": 26.34674835205078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.626953125, - "step": 377, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9962363243103027 - }, - { - "episode": 6064, - "epoch": 0.0363326982300991, - "loss/policy_avg": 0.007799604907631874, - "lr": 9.758435582822087e-06, - "objective/entropy": 101.9019546508789, - "objective/kl": 46.355655670166016, - "objective/non_score_reward": -2.3177828788757324, - "objective/rlhf_reward": -11.27113151550293, - "objective/scores": -0.5, - "policy/approxkl_avg": 68.56651306152344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4853515625, - "step": 378, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000476360321045 - }, - { - "episode": 6080, - "epoch": 0.03642856286922865, - "loss/policy_avg": -0.08341000974178314, - "lr": 9.757796523517384e-06, - "objective/entropy": 89.2676010131836, - "objective/kl": 46.89963912963867, - "objective/non_score_reward": -2.3449819087982178, - "objective/rlhf_reward": -9.37992775440216, - "objective/scores": 0.0, - "policy/approxkl_avg": 182.4227294921875, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.583984375, - "step": 379, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0021252632141113 - }, - { - "episode": 6096, - "epoch": 0.0365244275083582, - "loss/policy_avg": 0.34633713960647583, - "lr": 9.75715746421268e-06, - "objective/entropy": 111.65666198730469, - "objective/kl": 32.17061233520508, - "objective/non_score_reward": -1.6085307598114014, - "objective/rlhf_reward": -6.434122920036316, - "objective/scores": 0.0, - "policy/approxkl_avg": 88.91949462890625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.568359375, - "step": 380, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998034954071045 - }, - { - "episode": 6112, - "epoch": 0.03662029214748775, - "loss/policy_avg": 0.4559730887413025, - "lr": 9.756518404907976e-06, - "objective/entropy": -2.1730079650878906, - "objective/kl": 24.72875213623047, - "objective/non_score_reward": -1.2364375591278076, - "objective/rlhf_reward": -4.945750296115875, - "objective/scores": 0.0, - "policy/approxkl_avg": 28.68434715270996, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6171875, - "step": 381, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0009069442749023 - }, - { - "episode": 6128, - "epoch": 0.0367161567866173, - "loss/policy_avg": 0.255852073431015, - "lr": 9.755879345603273e-06, - "objective/entropy": 189.11827087402344, - "objective/kl": 39.26380157470703, - "objective/non_score_reward": -1.9631900787353516, - "objective/rlhf_reward": -7.852760314941406, - "objective/scores": 0.0, - "policy/approxkl_avg": 43.19601821899414, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.603515625, - "step": 382, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9998410940170288 - }, - { - "episode": 6144, - "epoch": 0.03681202142574685, - "loss/policy_avg": 0.07754311710596085, - "lr": 9.75524028629857e-06, - "objective/entropy": 128.18150329589844, - "objective/kl": 30.46546173095703, - "objective/non_score_reward": -1.523273229598999, - "objective/rlhf_reward": -4.431233172834502, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 10.884984970092773, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.466796875, - "step": 383, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0032522678375244 - }, - { - "episode": 6160, - "epoch": 0.0369078860648764, - "loss/policy_avg": 0.021326124668121338, - "lr": 9.754601226993867e-06, - "objective/entropy": 110.02630615234375, - "objective/kl": 46.91279602050781, - "objective/non_score_reward": -2.345640182495117, - "objective/rlhf_reward": -11.382560729980469, - "objective/scores": -0.5, - "policy/approxkl_avg": 92.21537780761719, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4365234375, - "step": 384, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984705448150635 - }, - { - "episode": 6176, - "epoch": 0.037003750704005946, - "loss/policy_avg": -0.04232418164610863, - "lr": 9.753962167689162e-06, - "objective/entropy": 150.83932495117188, - "objective/kl": 36.70783996582031, - "objective/non_score_reward": -1.8353919982910156, - "objective/rlhf_reward": -7.341568350791931, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.093977928161621, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4365234375, - "step": 385, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.997746467590332 - }, - { - "episode": 6192, - "epoch": 0.037099615343135495, - "loss/policy_avg": -0.19166389107704163, - "lr": 9.753323108384459e-06, - "objective/entropy": 171.91030883789062, - "objective/kl": 34.13121032714844, - "objective/non_score_reward": -1.7065606117248535, - "objective/rlhf_reward": -8.826242446899414, - "objective/scores": -0.5, - "policy/approxkl_avg": 5.414439678192139, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.537109375, - "step": 386, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.011042356491089 - }, - { - "episode": 6208, - "epoch": 0.037195479982265044, - "loss/policy_avg": 0.1640317738056183, - "lr": 9.752684049079756e-06, - "objective/entropy": 48.400291442871094, - "objective/kl": 50.87938690185547, - "objective/non_score_reward": -2.54396915435791, - "objective/rlhf_reward": -8.351048584255288, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 37.014671325683594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5234375, - "step": 387, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9981615543365479 - }, - { - "episode": 6224, - "epoch": 0.03729134462139459, - "loss/policy_avg": 0.3416689932346344, - "lr": 9.752044989775053e-06, - "objective/entropy": 75.96147155761719, - "objective/kl": 41.44689178466797, - "objective/non_score_reward": -2.0723445415496826, - "objective/rlhf_reward": -10.28937816619873, - "objective/scores": -0.5, - "policy/approxkl_avg": 87.05730438232422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62890625, - "step": 388, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997666358947754 - }, - { - "episode": 6240, - "epoch": 0.03738720926052414, - "loss/policy_avg": 0.43460220098495483, - "lr": 9.751405930470348e-06, - "objective/entropy": 77.63334655761719, - "objective/kl": 35.82466125488281, - "objective/non_score_reward": -1.7912328243255615, - "objective/rlhf_reward": -5.805681669448299, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 26.701194763183594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3876953125, - "step": 389, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001871109008789 - }, - { - "episode": 6256, - "epoch": 0.03748307389965369, - "loss/policy_avg": -0.16133618354797363, - "lr": 9.750766871165645e-06, - "objective/entropy": 118.75211334228516, - "objective/kl": 38.7314453125, - "objective/non_score_reward": -1.9365723133087158, - "objective/rlhf_reward": -9.746289253234863, - "objective/scores": -0.5, - "policy/approxkl_avg": 171.99203491210938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3916015625, - "step": 390, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9972796440124512 - }, - { - "episode": 6272, - "epoch": 0.03757893853878324, - "loss/policy_avg": 0.3922348618507385, - "lr": 9.750127811860941e-06, - "objective/entropy": 102.3033447265625, - "objective/kl": 32.2958869934082, - "objective/non_score_reward": -1.6147942543029785, - "objective/rlhf_reward": -8.459177017211914, - "objective/scores": -0.5, - "policy/approxkl_avg": 48.286659240722656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.796875, - "step": 391, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978159666061401 - }, - { - "episode": 6288, - "epoch": 0.03767480317791279, - "loss/policy_avg": 0.17723676562309265, - "lr": 9.749488752556238e-06, - "objective/entropy": 22.37232208251953, - "objective/kl": 27.597850799560547, - "objective/non_score_reward": -1.3798925876617432, - "objective/rlhf_reward": -5.519570171833038, - "objective/scores": 0.0, - "policy/approxkl_avg": 56.990116119384766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4443359375, - "step": 392, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9986002445220947 - }, - { - "episode": 6304, - "epoch": 0.03777066781704234, - "loss/policy_avg": 0.7708048820495605, - "lr": 9.748849693251534e-06, - "objective/entropy": 177.01199340820312, - "objective/kl": 42.79792404174805, - "objective/non_score_reward": -2.1398961544036865, - "objective/rlhf_reward": -4.159584856033325, - "objective/scores": 1.1, - "policy/approxkl_avg": 3.181797504425049, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.693359375, - "step": 393, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0018231868743896 - }, - { - "episode": 6320, - "epoch": 0.037866532456171886, - "loss/policy_avg": 0.34089791774749756, - "lr": 9.74821063394683e-06, - "objective/entropy": 140.12313842773438, - "objective/kl": 46.36503219604492, - "objective/non_score_reward": -2.318251609802246, - "objective/rlhf_reward": -7.894404628363949, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 204.4562225341797, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.412109375, - "step": 394, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0007781982421875 - }, - { - "episode": 6336, - "epoch": 0.037962397095301435, - "loss/policy_avg": 0.16855208575725555, - "lr": 9.747571574642127e-06, - "objective/entropy": 142.73866271972656, - "objective/kl": 39.905517578125, - "objective/non_score_reward": -1.9952759742736816, - "objective/rlhf_reward": -9.981103897094727, - "objective/scores": -0.5, - "policy/approxkl_avg": 18.86113739013672, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4619140625, - "step": 395, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9989800453186035 - }, - { - "episode": 6352, - "epoch": 0.038058261734430984, - "loss/policy_avg": -0.01784345507621765, - "lr": 9.746932515337424e-06, - "objective/entropy": 206.17181396484375, - "objective/kl": 50.423553466796875, - "objective/non_score_reward": -2.5211775302886963, - "objective/rlhf_reward": -12.084710121154785, - "objective/scores": -0.5, - "policy/approxkl_avg": 78.37747192382812, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.64453125, - "step": 396, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0067272186279297 - }, - { - "episode": 6368, - "epoch": 0.03815412637356053, - "loss/policy_avg": -0.05958561599254608, - "lr": 9.746293456032721e-06, - "objective/entropy": 76.2546157836914, - "objective/kl": 21.187191009521484, - "objective/non_score_reward": -1.0593595504760742, - "objective/rlhf_reward": -2.5041048685709635, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 19.284114837646484, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.666015625, - "step": 397, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0066661834716797 - }, - { - "episode": 6384, - "epoch": 0.03824999101269008, - "loss/policy_avg": -0.023946866393089294, - "lr": 9.745654396728016e-06, - "objective/entropy": 62.24019241333008, - "objective/kl": 30.506431579589844, - "objective/non_score_reward": -1.5253217220306396, - "objective/rlhf_reward": -8.101286888122559, - "objective/scores": -0.5, - "policy/approxkl_avg": 56.33515930175781, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.609375, - "step": 398, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000063180923462 - }, - { - "episode": 6400, - "epoch": 0.03834585565181963, - "loss/policy_avg": -0.22125929594039917, - "lr": 9.745015337423313e-06, - "objective/entropy": 106.82333374023438, - "objective/kl": 38.80064392089844, - "objective/non_score_reward": -1.9400321245193481, - "objective/rlhf_reward": -9.76012897491455, - "objective/scores": -0.5, - "policy/approxkl_avg": 26.642780303955078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.662109375, - "step": 399, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998152256011963 - }, - { - "episode": 6416, - "epoch": 0.03844172029094918, - "loss/policy_avg": 1.0083184242248535, - "lr": 9.74437627811861e-06, - "objective/entropy": -8.957130432128906, - "objective/kl": 45.45331573486328, - "objective/non_score_reward": -2.2726659774780273, - "objective/rlhf_reward": -9.09066355228424, - "objective/scores": 0.0, - "policy/approxkl_avg": 336.0928955078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.453125, - "step": 400, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9965978860855103 - }, - { - "episode": 6432, - "epoch": 0.03853758493007873, - "loss/policy_avg": 0.06534097343683243, - "lr": 9.743737218813907e-06, - "objective/entropy": 43.69670486450195, - "objective/kl": 37.32368469238281, - "objective/non_score_reward": -1.866184115409851, - "objective/rlhf_reward": -9.464736938476562, - "objective/scores": -0.5, - "policy/approxkl_avg": 30.217670440673828, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.578125, - "step": 401, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000572443008423 - }, - { - "episode": 6448, - "epoch": 0.03863344956920828, - "loss/policy_avg": 0.21769657731056213, - "lr": 9.743098159509204e-06, - "objective/entropy": 117.73030090332031, - "objective/kl": 35.21237564086914, - "objective/non_score_reward": -1.7606186866760254, - "objective/rlhf_reward": -9.042474746704102, - "objective/scores": -0.5, - "policy/approxkl_avg": 106.69755554199219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5625, - "step": 402, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000271797180176 - }, - { - "episode": 6464, - "epoch": 0.03872931420833783, - "loss/policy_avg": 0.2948653995990753, - "lr": 9.7424591002045e-06, - "objective/entropy": 122.77405548095703, - "objective/kl": 32.650047302246094, - "objective/non_score_reward": -1.6325020790100098, - "objective/rlhf_reward": -5.204495821028871, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 234.55484008789062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.37890625, - "step": 403, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999098539352417 - }, - { - "episode": 6480, - "epoch": 0.038825178847467376, - "loss/policy_avg": 0.051352113485336304, - "lr": 9.741820040899796e-06, - "objective/entropy": -68.59423065185547, - "objective/kl": 38.657508850097656, - "objective/non_score_reward": -1.932875633239746, - "objective/rlhf_reward": -9.731502532958984, - "objective/scores": -0.5, - "policy/approxkl_avg": 14.453775405883789, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.56640625, - "step": 404, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9975395202636719 - }, - { - "episode": 6496, - "epoch": 0.038921043486596925, - "loss/policy_avg": 0.16383764147758484, - "lr": 9.741180981595093e-06, - "objective/entropy": 180.20687866210938, - "objective/kl": 46.885032653808594, - "objective/non_score_reward": -2.3442516326904297, - "objective/rlhf_reward": -11.377006530761719, - "objective/scores": -0.5, - "policy/approxkl_avg": 36.65456008911133, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.890625, - "step": 405, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9967621564865112 - }, - { - "episode": 6512, - "epoch": 0.039016908125726474, - "loss/policy_avg": 0.13604994118213654, - "lr": 9.74054192229039e-06, - "objective/entropy": 3.9914321899414062, - "objective/kl": 40.065860748291016, - "objective/non_score_reward": -2.003293037414551, - "objective/rlhf_reward": -10.013172149658203, - "objective/scores": -0.5, - "policy/approxkl_avg": 10.099180221557617, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.505859375, - "step": 406, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998462200164795 - }, - { - "episode": 6528, - "epoch": 0.03911277276485602, - "loss/policy_avg": 0.3104326128959656, - "lr": 9.739902862985686e-06, - "objective/entropy": 70.04134368896484, - "objective/kl": 36.113067626953125, - "objective/non_score_reward": -1.8056533336639404, - "objective/rlhf_reward": -5.880977442770629, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 103.19497680664062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.611328125, - "step": 407, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9986114501953125 - }, - { - "episode": 6544, - "epoch": 0.03920863740398557, - "loss/policy_avg": 8.922710418701172, - "lr": 9.739263803680983e-06, - "objective/entropy": 75.4478759765625, - "objective/kl": 50.82036209106445, - "objective/non_score_reward": -2.541018009185791, - "objective/rlhf_reward": -10.164072275161743, - "objective/scores": 0.0, - "policy/approxkl_avg": 78.79092407226562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.591796875, - "step": 408, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998524188995361 - }, - { - "episode": 6560, - "epoch": 0.03930450204311512, - "loss/policy_avg": 0.2832748293876648, - "lr": 9.73862474437628e-06, - "objective/entropy": 109.94105529785156, - "objective/kl": 39.018890380859375, - "objective/non_score_reward": -1.9509445428848267, - "objective/rlhf_reward": -9.803777694702148, - "objective/scores": -0.5, - "policy/approxkl_avg": 85.17427062988281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3232421875, - "step": 409, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999758005142212 - }, - { - "episode": 6576, - "epoch": 0.03940036668224467, - "loss/policy_avg": 1.1405491828918457, - "lr": 9.737985685071575e-06, - "objective/entropy": 178.7044677734375, - "objective/kl": 34.637672424316406, - "objective/non_score_reward": -1.7318837642669678, - "objective/rlhf_reward": -8.927535057067871, - "objective/scores": -0.5, - "policy/approxkl_avg": 29.027103424072266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.650390625, - "step": 410, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000344753265381 - }, - { - "episode": 6592, - "epoch": 0.03949623132137422, - "loss/policy_avg": -0.1532156616449356, - "lr": 9.737346625766872e-06, - "objective/entropy": 82.60413360595703, - "objective/kl": 30.524627685546875, - "objective/non_score_reward": -1.5262314081192017, - "objective/rlhf_reward": -6.104925632476807, - "objective/scores": 0.0, - "policy/approxkl_avg": 10.448099136352539, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.494140625, - "step": 411, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0018105506896973 - }, - { - "episode": 6608, - "epoch": 0.03959209596050377, - "loss/policy_avg": 0.17056474089622498, - "lr": 9.736707566462167e-06, - "objective/entropy": 157.149658203125, - "objective/kl": 33.411128997802734, - "objective/non_score_reward": -1.6705564260482788, - "objective/rlhf_reward": -6.682225704193115, - "objective/scores": 0.0, - "policy/approxkl_avg": 184.52642822265625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.72265625, - "step": 412, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9977160692214966 - }, - { - "episode": 6624, - "epoch": 0.039687960599633317, - "loss/policy_avg": 0.25223565101623535, - "lr": 9.736068507157464e-06, - "objective/entropy": 48.83440017700195, - "objective/kl": 31.99204444885254, - "objective/non_score_reward": -1.599602222442627, - "objective/rlhf_reward": -8.398408889770508, - "objective/scores": -0.5, - "policy/approxkl_avg": 53.256534576416016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3837890625, - "step": 413, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9977399110794067 - }, - { - "episode": 6640, - "epoch": 0.039783825238762865, - "loss/policy_avg": 0.1266993284225464, - "lr": 9.735429447852761e-06, - "objective/entropy": 159.01785278320312, - "objective/kl": 31.140743255615234, - "objective/non_score_reward": -1.5570372343063354, - "objective/rlhf_reward": -6.228149056434631, - "objective/scores": 0.0, - "policy/approxkl_avg": 101.76364135742188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4931640625, - "step": 414, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9946467876434326 - }, - { - "episode": 6656, - "epoch": 0.039879689877892414, - "loss/policy_avg": 0.22998680174350739, - "lr": 9.734790388548058e-06, - "objective/entropy": 46.99758529663086, - "objective/kl": 35.97903823852539, - "objective/non_score_reward": -1.7989518642425537, - "objective/rlhf_reward": -9.195807456970215, - "objective/scores": -0.5, - "policy/approxkl_avg": 68.40435791015625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59375, - "step": 415, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0022130012512207 - }, - { - "episode": 6672, - "epoch": 0.03997555451702196, - "loss/policy_avg": 0.1837829202413559, - "lr": 9.734151329243355e-06, - "objective/entropy": 249.3802490234375, - "objective/kl": 43.223350524902344, - "objective/non_score_reward": -2.1611673831939697, - "objective/rlhf_reward": -8.644670009613037, - "objective/scores": 0.0, - "policy/approxkl_avg": 47.855064392089844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.78125, - "step": 416, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992127418518066 - }, - { - "episode": 6688, - "epoch": 0.04007141915615151, - "loss/policy_avg": 0.035887204110622406, - "lr": 9.73351226993865e-06, - "objective/entropy": 136.988525390625, - "objective/kl": 28.082172393798828, - "objective/non_score_reward": -1.4041086435317993, - "objective/rlhf_reward": -5.616434454917908, - "objective/scores": 0.0, - "policy/approxkl_avg": 85.44422912597656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4560546875, - "step": 417, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0017051696777344 - }, - { - "episode": 6704, - "epoch": 0.04016728379528106, - "loss/policy_avg": 0.5762748718261719, - "lr": 9.732873210633947e-06, - "objective/entropy": 253.0686798095703, - "objective/kl": 40.425331115722656, - "objective/non_score_reward": -2.021266460418701, - "objective/rlhf_reward": -10.085065841674805, - "objective/scores": -0.5, - "policy/approxkl_avg": 64.24309539794922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.783203125, - "step": 418, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9996999502182007 - }, - { - "episode": 6720, - "epoch": 0.04026314843441061, - "loss/policy_avg": 0.2653573751449585, - "lr": 9.732234151329244e-06, - "objective/entropy": 185.62173461914062, - "objective/kl": 48.3837890625, - "objective/non_score_reward": -2.419189453125, - "objective/rlhf_reward": -11.6767578125, - "objective/scores": -0.5, - "policy/approxkl_avg": 84.46121215820312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484375, - "step": 419, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0015792846679688 - }, - { - "episode": 6736, - "epoch": 0.04035901307354016, - "loss/policy_avg": 0.45824360847473145, - "lr": 9.73159509202454e-06, - "objective/entropy": 189.05142211914062, - "objective/kl": 40.93785858154297, - "objective/non_score_reward": -2.0468931198120117, - "objective/rlhf_reward": -8.187572240829468, - "objective/scores": 0.0, - "policy/approxkl_avg": 365.19354248046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.607421875, - "step": 420, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9960243701934814 - }, - { - "episode": 6752, - "epoch": 0.04045487771266971, - "loss/policy_avg": -0.011572149582207203, - "lr": 9.730956032719838e-06, - "objective/entropy": 157.015625, - "objective/kl": 40.52888488769531, - "objective/non_score_reward": -2.026444435119629, - "objective/rlhf_reward": -8.105777025222778, - "objective/scores": 0.0, - "policy/approxkl_avg": 36.79645538330078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4921875, - "step": 421, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.00020170211792 - }, - { - "episode": 6768, - "epoch": 0.04055074235179926, - "loss/policy_avg": 0.09691079705953598, - "lr": 9.730316973415135e-06, - "objective/entropy": 169.43841552734375, - "objective/kl": 38.485565185546875, - "objective/non_score_reward": -1.9242782592773438, - "objective/rlhf_reward": -7.697112798690796, - "objective/scores": 0.0, - "policy/approxkl_avg": 44.24542236328125, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5, - "step": 422, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003914833068848 - }, - { - "episode": 6784, - "epoch": 0.040646606990928806, - "loss/policy_avg": 0.34911060333251953, - "lr": 9.72967791411043e-06, - "objective/entropy": 208.74990844726562, - "objective/kl": 37.376976013183594, - "objective/non_score_reward": -1.8688490390777588, - "objective/rlhf_reward": -6.11614605161993, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 94.55046844482422, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.607421875, - "step": 423, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0001683235168457 - }, - { - "episode": 6800, - "epoch": 0.040742471630058355, - "loss/policy_avg": 0.04235881194472313, - "lr": 9.729038854805727e-06, - "objective/entropy": -29.72011947631836, - "objective/kl": 40.81784439086914, - "objective/non_score_reward": -2.0408921241760254, - "objective/rlhf_reward": -6.33874034431846, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 160.66685485839844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.68359375, - "step": 424, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9954640865325928 - }, - { - "episode": 6816, - "epoch": 0.040838336269187904, - "loss/policy_avg": 0.24281972646713257, - "lr": 9.728399795501023e-06, - "objective/entropy": 191.4829559326172, - "objective/kl": 36.464630126953125, - "objective/non_score_reward": -1.8232316970825195, - "objective/rlhf_reward": -5.688806984488087, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 22.607797622680664, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.64453125, - "step": 425, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9987616539001465 - }, - { - "episode": 6832, - "epoch": 0.04093420090831745, - "loss/policy_avg": 0.28001827001571655, - "lr": 9.72776073619632e-06, - "objective/entropy": 165.8070526123047, - "objective/kl": 41.63663864135742, - "objective/non_score_reward": -2.081831932067871, - "objective/rlhf_reward": -8.327327966690063, - "objective/scores": 0.0, - "policy/approxkl_avg": 62.481040954589844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6953125, - "step": 426, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999871253967285 - }, - { - "episode": 6848, - "epoch": 0.041030065547447, - "loss/policy_avg": 0.2623136341571808, - "lr": 9.727121676891617e-06, - "objective/entropy": 93.63900756835938, - "objective/kl": 23.993732452392578, - "objective/non_score_reward": -1.1996865272521973, - "objective/rlhf_reward": -3.28297432640427, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 19.302370071411133, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.591796875, - "step": 427, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0000486373901367 - }, - { - "episode": 6864, - "epoch": 0.04112593018657655, - "loss/policy_avg": 0.2426406443119049, - "lr": 9.726482617586912e-06, - "objective/entropy": 113.51960754394531, - "objective/kl": 37.362518310546875, - "objective/non_score_reward": -1.8681257963180542, - "objective/rlhf_reward": -9.472503662109375, - "objective/scores": -0.5, - "policy/approxkl_avg": 50.487884521484375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.361328125, - "step": 428, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9970954656600952 - }, - { - "episode": 6880, - "epoch": 0.0412217948257061, - "loss/policy_avg": 0.8704113960266113, - "lr": 9.72584355828221e-06, - "objective/entropy": 95.84138488769531, - "objective/kl": 36.00267028808594, - "objective/non_score_reward": -1.800133466720581, - "objective/rlhf_reward": -5.37570517805488, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 108.54400634765625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65625, - "step": 429, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9939507246017456 - }, - { - "episode": 6896, - "epoch": 0.04131765946483565, - "loss/policy_avg": 0.4814898669719696, - "lr": 9.725204498977506e-06, - "objective/entropy": 225.97140502929688, - "objective/kl": 44.52623748779297, - "objective/non_score_reward": -2.2263121604919434, - "objective/rlhf_reward": -7.348988621440485, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 30.636560440063477, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6640625, - "step": 430, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9988288879394531 - }, - { - "episode": 6912, - "epoch": 0.0414135241039652, - "loss/policy_avg": -0.3788025379180908, - "lr": 9.724565439672803e-06, - "objective/entropy": 173.88014221191406, - "objective/kl": 57.236175537109375, - "objective/non_score_reward": -2.8618087768554688, - "objective/rlhf_reward": -13.447235107421875, - "objective/scores": -0.5, - "policy/approxkl_avg": 28.293506622314453, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.552734375, - "step": 431, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000931739807129 - }, - { - "episode": 6928, - "epoch": 0.041509388743094754, - "loss/policy_avg": 0.2732902765274048, - "lr": 9.7239263803681e-06, - "objective/entropy": 109.51008605957031, - "objective/kl": 37.86288833618164, - "objective/non_score_reward": -1.8931443691253662, - "objective/rlhf_reward": -5.449871124998603, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 80.19667053222656, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.328125, - "step": 432, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9976387023925781 - }, - { - "episode": 6944, - "epoch": 0.0416052533822243, - "loss/policy_avg": 0.27860450744628906, - "lr": 9.723287321063397e-06, - "objective/entropy": 36.931854248046875, - "objective/kl": 32.6243896484375, - "objective/non_score_reward": -1.6312193870544434, - "objective/rlhf_reward": -8.524877548217773, - "objective/scores": -0.5, - "policy/approxkl_avg": 26.86789894104004, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.517578125, - "step": 433, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978294372558594 - }, - { - "episode": 6960, - "epoch": 0.04170111802135385, - "loss/policy_avg": 0.5596253275871277, - "lr": 9.722648261758692e-06, - "objective/entropy": 199.27142333984375, - "objective/kl": 44.00600051879883, - "objective/non_score_reward": -2.2003002166748047, - "objective/rlhf_reward": -7.350602249713287, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 53.96643829345703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.580078125, - "step": 434, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.001927614212036 - }, - { - "episode": 6976, - "epoch": 0.0417969826604834, - "loss/policy_avg": 0.7040017247200012, - "lr": 9.722009202453989e-06, - "objective/entropy": 170.60586547851562, - "objective/kl": 41.97274398803711, - "objective/non_score_reward": -2.098637104034424, - "objective/rlhf_reward": -5.470829759479734, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 91.50270080566406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.546875, - "step": 435, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999673843383789 - }, - { - "episode": 6992, - "epoch": 0.04189284729961295, - "loss/policy_avg": 0.3991093337535858, - "lr": 9.721370143149284e-06, - "objective/entropy": 180.92874145507812, - "objective/kl": 38.3790283203125, - "objective/non_score_reward": -1.9189512729644775, - "objective/rlhf_reward": -5.942471877733866, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 3.5480709075927734, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.55078125, - "step": 436, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.001030683517456 - }, - { - "episode": 7008, - "epoch": 0.0419887119387425, - "loss/policy_avg": 0.2966272234916687, - "lr": 9.720731083844581e-06, - "objective/entropy": 142.98663330078125, - "objective/kl": 30.8831729888916, - "objective/non_score_reward": -1.5441588163375854, - "objective/rlhf_reward": -1.776635146141052, - "objective/scores": 1.1, - "policy/approxkl_avg": 19.243196487426758, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4873046875, - "step": 437, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0003151893615723 - }, - { - "episode": 7024, - "epoch": 0.04208457657787205, - "loss/policy_avg": 0.22927549481391907, - "lr": 9.720092024539878e-06, - "objective/entropy": -2.2561073303222656, - "objective/kl": 46.941993713378906, - "objective/non_score_reward": -2.347099781036377, - "objective/rlhf_reward": -7.784278903071003, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 284.0980224609375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.40625, - "step": 438, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9965739250183105 - }, - { - "episode": 7040, - "epoch": 0.042180441217001596, - "loss/policy_avg": -0.37906330823898315, - "lr": 9.719452965235175e-06, - "objective/entropy": 190.4355010986328, - "objective/kl": 46.711570739746094, - "objective/non_score_reward": -2.3355789184570312, - "objective/rlhf_reward": -6.942314958572387, - "objective/scores": 0.6, - "policy/approxkl_avg": 124.1642074584961, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.59765625, - "step": 439, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0039045810699463 - }, - { - "episode": 7056, - "epoch": 0.042276305856131145, - "loss/policy_avg": 0.2566912770271301, - "lr": 9.718813905930472e-06, - "objective/entropy": 125.67393493652344, - "objective/kl": 41.860557556152344, - "objective/non_score_reward": -2.0930278301239014, - "objective/rlhf_reward": -8.372111141681671, - "objective/scores": 0.0, - "policy/approxkl_avg": 13.688385009765625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3818359375, - "step": 440, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9996503591537476 - }, - { - "episode": 7072, - "epoch": 0.042372170495260694, - "loss/policy_avg": 0.8826879262924194, - "lr": 9.718174846625767e-06, - "objective/entropy": 151.81240844726562, - "objective/kl": 46.84815979003906, - "objective/non_score_reward": -2.342407703399658, - "objective/rlhf_reward": -9.369631052017212, - "objective/scores": 0.0, - "policy/approxkl_avg": 268.04595947265625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.646484375, - "step": 441, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988102912902832 - }, - { - "episode": 7088, - "epoch": 0.04246803513439024, - "loss/policy_avg": -0.02791355550289154, - "lr": 9.717535787321064e-06, - "objective/entropy": 51.914703369140625, - "objective/kl": 49.47495651245117, - "objective/non_score_reward": -2.473747968673706, - "objective/rlhf_reward": -11.894991874694824, - "objective/scores": -0.5, - "policy/approxkl_avg": 68.78294372558594, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.552734375, - "step": 442, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.00296950340271 - }, - { - "episode": 7104, - "epoch": 0.04256389977351979, - "loss/policy_avg": 0.2161749303340912, - "lr": 9.71689672801636e-06, - "objective/entropy": 155.6175079345703, - "objective/kl": 33.92812728881836, - "objective/non_score_reward": -1.696406364440918, - "objective/rlhf_reward": -5.229366033282831, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 162.97238159179688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.474609375, - "step": 443, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9964041709899902 - }, - { - "episode": 7120, - "epoch": 0.04265976441264934, - "loss/policy_avg": 1.1079907417297363, - "lr": 9.716257668711657e-06, - "objective/entropy": 228.69253540039062, - "objective/kl": 41.69932556152344, - "objective/non_score_reward": -2.0849664211273193, - "objective/rlhf_reward": -10.339865684509277, - "objective/scores": -0.5, - "policy/approxkl_avg": 299.07171630859375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.689453125, - "step": 444, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000746726989746 - }, - { - "episode": 7136, - "epoch": 0.04275562905177889, - "loss/policy_avg": -0.18060393631458282, - "lr": 9.715618609406954e-06, - "objective/entropy": 116.65229034423828, - "objective/kl": 30.7730712890625, - "objective/non_score_reward": -1.5386537313461304, - "objective/rlhf_reward": -4.843933692178131, - "objective/scores": 0.327670248696953, - "policy/approxkl_avg": 3.5278494358062744, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6953125, - "step": 445, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0028085708618164 - }, - { - "episode": 7152, - "epoch": 0.04285149369090844, - "loss/policy_avg": 0.06034235656261444, - "lr": 9.714979550102251e-06, - "objective/entropy": 83.11370849609375, - "objective/kl": 48.45117950439453, - "objective/non_score_reward": -2.4225587844848633, - "objective/rlhf_reward": -11.690235137939453, - "objective/scores": -0.5, - "policy/approxkl_avg": 19.12924575805664, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.619140625, - "step": 446, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9970791339874268 - }, - { - "episode": 7168, - "epoch": 0.04294735833003799, - "loss/policy_avg": 2.49505352973938, - "lr": 9.714340490797546e-06, - "objective/entropy": -52.68606185913086, - "objective/kl": 40.133392333984375, - "objective/non_score_reward": -2.006669521331787, - "objective/rlhf_reward": -8.026678442955017, - "objective/scores": 0.0, - "policy/approxkl_avg": 8.131301879882812, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.462890625, - "step": 447, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0003855228424072 - }, - { - "episode": 7184, - "epoch": 0.04304322296916754, - "loss/policy_avg": 0.5659292936325073, - "lr": 9.713701431492843e-06, - "objective/entropy": 182.43234252929688, - "objective/kl": 42.25321578979492, - "objective/non_score_reward": -2.1126608848571777, - "objective/rlhf_reward": -10.450643539428711, - "objective/scores": -0.5, - "policy/approxkl_avg": 101.77164459228516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.560546875, - "step": 448, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9980123043060303 - }, - { - "episode": 7200, - "epoch": 0.043139087608297086, - "loss/policy_avg": 0.0843241959810257, - "lr": 9.71306237218814e-06, - "objective/entropy": 245.69744873046875, - "objective/kl": 29.222219467163086, - "objective/non_score_reward": -1.4611108303070068, - "objective/rlhf_reward": -5.844443321228027, - "objective/scores": 0.0, - "policy/approxkl_avg": 54.68086242675781, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.744140625, - "step": 449, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9981093406677246 - }, - { - "episode": 7216, - "epoch": 0.043234952247426635, - "loss/policy_avg": 0.09573544561862946, - "lr": 9.712423312883437e-06, - "objective/entropy": 93.39710998535156, - "objective/kl": 36.68029022216797, - "objective/non_score_reward": -1.8340145349502563, - "objective/rlhf_reward": -5.511229510578226, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 11.1849365234375, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.2919921875, - "step": 450, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0029678344726562 - }, - { - "episode": 7232, - "epoch": 0.043330816886556184, - "loss/policy_avg": 0.7713199853897095, - "lr": 9.711784253578734e-06, - "objective/entropy": 172.16571044921875, - "objective/kl": 45.20444107055664, - "objective/non_score_reward": -2.2602221965789795, - "objective/rlhf_reward": -7.662286260215145, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 202.79624938964844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.546875, - "step": 451, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003104209899902 - }, - { - "episode": 7248, - "epoch": 0.04342668152568573, - "loss/policy_avg": 0.33446595072746277, - "lr": 9.711145194274029e-06, - "objective/entropy": 200.54441833496094, - "objective/kl": 43.61439514160156, - "objective/non_score_reward": -2.1807198524475098, - "objective/rlhf_reward": -8.72287917137146, - "objective/scores": 0.0, - "policy/approxkl_avg": 175.7442626953125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.60546875, - "step": 452, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.998628854751587 - }, - { - "episode": 7264, - "epoch": 0.04352254616481528, - "loss/policy_avg": 0.38772690296173096, - "lr": 9.710506134969326e-06, - "objective/entropy": 41.2994384765625, - "objective/kl": 44.39094543457031, - "objective/non_score_reward": -2.2195472717285156, - "objective/rlhf_reward": -10.878189086914062, - "objective/scores": -0.5, - "policy/approxkl_avg": 134.41238403320312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.544921875, - "step": 453, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978395700454712 - }, - { - "episode": 7280, - "epoch": 0.04361841080394483, - "loss/policy_avg": 0.6301360130310059, - "lr": 9.709867075664623e-06, - "objective/entropy": 107.89152526855469, - "objective/kl": 38.03111267089844, - "objective/non_score_reward": -1.9015557765960693, - "objective/rlhf_reward": -9.606223106384277, - "objective/scores": -0.5, - "policy/approxkl_avg": 58.21129608154297, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.54296875, - "step": 454, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9977179765701294 - }, - { - "episode": 7296, - "epoch": 0.04371427544307438, - "loss/policy_avg": -0.06993488222360611, - "lr": 9.70922801635992e-06, - "objective/entropy": 212.8966827392578, - "objective/kl": 27.732505798339844, - "objective/non_score_reward": -1.3866255283355713, - "objective/rlhf_reward": -7.546501636505127, - "objective/scores": -0.5, - "policy/approxkl_avg": 1.7735340595245361, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6328125, - "step": 455, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.002204179763794 - }, - { - "episode": 7312, - "epoch": 0.04381014008220393, - "loss/policy_avg": 1.5585349798202515, - "lr": 9.708588957055215e-06, - "objective/entropy": 8.907943725585938, - "objective/kl": 46.3431510925293, - "objective/non_score_reward": -2.317157506942749, - "objective/rlhf_reward": -9.268629848957062, - "objective/scores": 0.0, - "policy/approxkl_avg": 82.92784118652344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4375, - "step": 456, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991109371185303 - }, - { - "episode": 7328, - "epoch": 0.04390600472133348, - "loss/policy_avg": 0.46995729207992554, - "lr": 9.707949897750512e-06, - "objective/entropy": 2.804157257080078, - "objective/kl": 24.88430404663086, - "objective/non_score_reward": -1.2442151308059692, - "objective/rlhf_reward": -2.8541544101395946, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 16.516504287719727, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.615234375, - "step": 457, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996424913406372 - }, - { - "episode": 7344, - "epoch": 0.044001869360463026, - "loss/policy_avg": 1.0241265296936035, - "lr": 9.707310838445809e-06, - "objective/entropy": 261.0165100097656, - "objective/kl": 39.613746643066406, - "objective/non_score_reward": -1.9806873798370361, - "objective/rlhf_reward": -7.922749698162079, - "objective/scores": 0.0, - "policy/approxkl_avg": 15.124380111694336, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.810546875, - "step": 458, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9987176656723022 - }, - { - "episode": 7360, - "epoch": 0.044097733999592575, - "loss/policy_avg": 0.18850752711296082, - "lr": 9.706671779141105e-06, - "objective/entropy": 123.50984191894531, - "objective/kl": 45.86781692504883, - "objective/non_score_reward": -2.2933907508850098, - "objective/rlhf_reward": -4.773563241958618, - "objective/scores": 1.1, - "policy/approxkl_avg": 22.56751251220703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.513671875, - "step": 459, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000584125518799 - }, - { - "episode": 7376, - "epoch": 0.044193598638722124, - "loss/policy_avg": 0.3746855854988098, - "lr": 9.7060327198364e-06, - "objective/entropy": 146.0401611328125, - "objective/kl": 38.55249786376953, - "objective/non_score_reward": -1.9276249408721924, - "objective/rlhf_reward": -7.71049952507019, - "objective/scores": 0.0, - "policy/approxkl_avg": 211.50054931640625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.732421875, - "step": 460, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9972167015075684 - }, - { - "episode": 7392, - "epoch": 0.04428946327785167, - "loss/policy_avg": 0.5910313725471497, - "lr": 9.705393660531698e-06, - "objective/entropy": 198.9757080078125, - "objective/kl": 37.61479187011719, - "objective/non_score_reward": -1.880739450454712, - "objective/rlhf_reward": -9.522957801818848, - "objective/scores": -0.5, - "policy/approxkl_avg": 204.2825469970703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619140625, - "step": 461, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9974172115325928 - }, - { - "episode": 7408, - "epoch": 0.04438532791698122, - "loss/policy_avg": -0.04315639287233353, - "lr": 9.704754601226994e-06, - "objective/entropy": 156.10818481445312, - "objective/kl": 26.727436065673828, - "objective/non_score_reward": -1.336371660232544, - "objective/rlhf_reward": -7.345486640930176, - "objective/scores": -0.5, - "policy/approxkl_avg": 40.13689422607422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.501953125, - "step": 462, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999086856842041 - }, - { - "episode": 7424, - "epoch": 0.04448119255611077, - "loss/policy_avg": 0.11912831664085388, - "lr": 9.704115541922291e-06, - "objective/entropy": 153.92323303222656, - "objective/kl": 29.754222869873047, - "objective/non_score_reward": -1.4877111911773682, - "objective/rlhf_reward": -7.950844764709473, - "objective/scores": -0.5, - "policy/approxkl_avg": 30.665206909179688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.763671875, - "step": 463, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983062744140625 - }, - { - "episode": 7440, - "epoch": 0.04457705719524032, - "loss/policy_avg": 0.34447717666625977, - "lr": 9.703476482617588e-06, - "objective/entropy": 20.401763916015625, - "objective/kl": 36.71702575683594, - "objective/non_score_reward": -1.8358510732650757, - "objective/rlhf_reward": -7.343404173851013, - "objective/scores": 0.0, - "policy/approxkl_avg": 50.746131896972656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.505859375, - "step": 464, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9996228218078613 - }, - { - "episode": 7456, - "epoch": 0.04467292183436987, - "loss/policy_avg": 0.05298962816596031, - "lr": 9.702837423312883e-06, - "objective/entropy": 107.51678466796875, - "objective/kl": 30.20279312133789, - "objective/non_score_reward": -1.5101397037506104, - "objective/rlhf_reward": -8.040558815002441, - "objective/scores": -0.5, - "policy/approxkl_avg": 6.385878086090088, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.595703125, - "step": 465, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9973207712173462 - }, - { - "episode": 7472, - "epoch": 0.04476878647349942, - "loss/policy_avg": 0.47295618057250977, - "lr": 9.70219836400818e-06, - "objective/entropy": 186.91851806640625, - "objective/kl": 39.759700775146484, - "objective/non_score_reward": -1.9879851341247559, - "objective/rlhf_reward": -6.347820553843098, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 6.087001800537109, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.556640625, - "step": 466, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9997063875198364 - }, - { - "episode": 7488, - "epoch": 0.04486465111262897, - "loss/policy_avg": 0.3874097764492035, - "lr": 9.701559304703477e-06, - "objective/entropy": 173.0869903564453, - "objective/kl": 43.96525192260742, - "objective/non_score_reward": -2.1982626914978027, - "objective/rlhf_reward": -7.414448239890438, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 85.4808349609375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.53515625, - "step": 467, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9984729290008545 - }, - { - "episode": 7504, - "epoch": 0.044960515751758516, - "loss/policy_avg": -0.2668102979660034, - "lr": 9.700920245398774e-06, - "objective/entropy": 71.85359191894531, - "objective/kl": 43.253211975097656, - "objective/non_score_reward": -2.162660598754883, - "objective/rlhf_reward": -10.650642395019531, - "objective/scores": -0.5, - "policy/approxkl_avg": 10.98958969116211, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.564453125, - "step": 468, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.006080389022827 - }, - { - "episode": 7520, - "epoch": 0.045056380390888065, - "loss/policy_avg": 0.3569624722003937, - "lr": 9.700281186094071e-06, - "objective/entropy": 189.16136169433594, - "objective/kl": 40.66926956176758, - "objective/non_score_reward": -2.033463478088379, - "objective/rlhf_reward": -8.133854150772095, - "objective/scores": 0.0, - "policy/approxkl_avg": 19.30429458618164, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.63671875, - "step": 469, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.99971604347229 - }, - { - "episode": 7536, - "epoch": 0.045152245030017614, - "loss/policy_avg": 0.37975427508354187, - "lr": 9.699642126789368e-06, - "objective/entropy": 38.9386100769043, - "objective/kl": 35.93472671508789, - "objective/non_score_reward": -1.796736478805542, - "objective/rlhf_reward": -5.736347894282684, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 93.73966979980469, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.57421875, - "step": 470, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998188853263855 - }, - { - "episode": 7552, - "epoch": 0.04524810966914716, - "loss/policy_avg": 0.2126319706439972, - "lr": 9.699003067484663e-06, - "objective/entropy": 14.602066040039062, - "objective/kl": 43.82414245605469, - "objective/non_score_reward": -2.19120717048645, - "objective/rlhf_reward": -10.764827728271484, - "objective/scores": -0.5, - "policy/approxkl_avg": 75.86665344238281, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4833984375, - "step": 471, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.005751132965088 - }, - { - "episode": 7568, - "epoch": 0.04534397430827671, - "loss/policy_avg": 0.5844190120697021, - "lr": 9.69836400817996e-06, - "objective/entropy": 84.61041259765625, - "objective/kl": 51.00375747680664, - "objective/non_score_reward": -2.5501880645751953, - "objective/rlhf_reward": -12.200752258300781, - "objective/scores": -0.5, - "policy/approxkl_avg": 147.34310913085938, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.44921875, - "step": 472, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0002200603485107 - }, - { - "episode": 7584, - "epoch": 0.04543983894740626, - "loss/policy_avg": -0.19812732934951782, - "lr": 9.697724948875257e-06, - "objective/entropy": 59.431541442871094, - "objective/kl": 42.676483154296875, - "objective/non_score_reward": -2.133824110031128, - "objective/rlhf_reward": -8.53529667854309, - "objective/scores": 0.0, - "policy/approxkl_avg": 25.93294906616211, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5625, - "step": 473, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001767873764038 - }, - { - "episode": 7600, - "epoch": 0.04553570358653581, - "loss/policy_avg": -0.1512744426727295, - "lr": 9.697085889570554e-06, - "objective/entropy": 34.99419403076172, - "objective/kl": 40.56854248046875, - "objective/non_score_reward": -2.0284271240234375, - "objective/rlhf_reward": -8.113708853721619, - "objective/scores": 0.0, - "policy/approxkl_avg": 4.354091644287109, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.509765625, - "step": 474, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0055007934570312 - }, - { - "episode": 7616, - "epoch": 0.04563156822566536, - "loss/policy_avg": 0.5833158493041992, - "lr": 9.69644683026585e-06, - "objective/entropy": 106.2252197265625, - "objective/kl": 37.68205261230469, - "objective/non_score_reward": -1.8841025829315186, - "objective/rlhf_reward": -5.980151384082392, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 22.509538650512695, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.623046875, - "step": 475, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9982876777648926 - }, - { - "episode": 7632, - "epoch": 0.04572743286479491, - "loss/policy_avg": 0.01958562433719635, - "lr": 9.695807770961146e-06, - "objective/entropy": 121.47393035888672, - "objective/kl": 48.13959503173828, - "objective/non_score_reward": -2.40697979927063, - "objective/rlhf_reward": -8.11214765289658, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 75.11579895019531, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.44140625, - "step": 476, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999737024307251 - }, - { - "episode": 7648, - "epoch": 0.04582329750392446, - "loss/policy_avg": 0.6820676326751709, - "lr": 9.695168711656443e-06, - "objective/entropy": 115.61346435546875, - "objective/kl": 37.32343292236328, - "objective/non_score_reward": -1.8661715984344482, - "objective/rlhf_reward": -9.464686393737793, - "objective/scores": -0.5, - "policy/approxkl_avg": 17.16072654724121, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53515625, - "step": 477, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000274419784546 - }, - { - "episode": 7664, - "epoch": 0.045919162143054006, - "loss/policy_avg": 0.40561172366142273, - "lr": 9.694529652351738e-06, - "objective/entropy": 90.63370513916016, - "objective/kl": 37.22323989868164, - "objective/non_score_reward": -1.8611619472503662, - "objective/rlhf_reward": -5.9940495296434015, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 285.416259765625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.662109375, - "step": 478, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9959321022033691 - }, - { - "episode": 7680, - "epoch": 0.046015026782183555, - "loss/policy_avg": 0.17466656863689423, - "lr": 9.693890593047035e-06, - "objective/entropy": 54.67130661010742, - "objective/kl": 41.81562805175781, - "objective/non_score_reward": -2.0907812118530273, - "objective/rlhf_reward": -8.363124966621399, - "objective/scores": 0.0, - "policy/approxkl_avg": 3.4145569801330566, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.59375, - "step": 479, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9975910186767578 - }, - { - "episode": 7696, - "epoch": 0.046110891421313104, - "loss/policy_avg": 0.4119042158126831, - "lr": 9.693251533742331e-06, - "objective/entropy": 92.60330963134766, - "objective/kl": 35.919769287109375, - "objective/non_score_reward": -1.7959884405136108, - "objective/rlhf_reward": -5.45062030951182, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 57.694190979003906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.279296875, - "step": 480, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003273487091064 - }, - { - "episode": 7712, - "epoch": 0.04620675606044265, - "loss/policy_avg": 0.06249671056866646, - "lr": 9.692612474437628e-06, - "objective/entropy": 102.40875244140625, - "objective/kl": 40.75782775878906, - "objective/non_score_reward": -2.037891387939453, - "objective/rlhf_reward": -10.151565551757812, - "objective/scores": -0.5, - "policy/approxkl_avg": 141.67388916015625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.705078125, - "step": 481, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0004329681396484 - }, - { - "episode": 7728, - "epoch": 0.0463026206995722, - "loss/policy_avg": 0.3060109615325928, - "lr": 9.691973415132925e-06, - "objective/entropy": 149.1110076904297, - "objective/kl": 42.07062530517578, - "objective/non_score_reward": -2.1035311222076416, - "objective/rlhf_reward": -10.414125442504883, - "objective/scores": -0.5, - "policy/approxkl_avg": 73.67881774902344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.630859375, - "step": 482, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.995924472808838 - }, - { - "episode": 7744, - "epoch": 0.04639848533870175, - "loss/policy_avg": 0.7175555229187012, - "lr": 9.691334355828222e-06, - "objective/entropy": 2.7473793029785156, - "objective/kl": 40.123050689697266, - "objective/non_score_reward": -2.006152629852295, - "objective/rlhf_reward": -8.024610042572021, - "objective/scores": 0.0, - "policy/approxkl_avg": 9.62218952178955, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.52734375, - "step": 483, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981098175048828 - }, - { - "episode": 7760, - "epoch": 0.0464943499778313, - "loss/policy_avg": 0.19902104139328003, - "lr": 9.690695296523517e-06, - "objective/entropy": 18.798885345458984, - "objective/kl": 38.83830261230469, - "objective/non_score_reward": -1.9419152736663818, - "objective/rlhf_reward": -9.767661094665527, - "objective/scores": -0.5, - "policy/approxkl_avg": 32.263641357421875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.572265625, - "step": 484, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000005006790161 - }, - { - "episode": 7776, - "epoch": 0.04659021461696085, - "loss/policy_avg": 0.11375686526298523, - "lr": 9.690056237218814e-06, - "objective/entropy": 132.22962951660156, - "objective/kl": 34.968589782714844, - "objective/non_score_reward": -1.748429536819458, - "objective/rlhf_reward": -8.993717193603516, - "objective/scores": -0.5, - "policy/approxkl_avg": 121.77729797363281, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.55859375, - "step": 485, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999431848526001 - }, - { - "episode": 7792, - "epoch": 0.0466860792560904, - "loss/policy_avg": 0.47958219051361084, - "lr": 9.689417177914111e-06, - "objective/entropy": 35.028343200683594, - "objective/kl": 35.37997055053711, - "objective/non_score_reward": -1.7689985036849976, - "objective/rlhf_reward": -7.07599413394928, - "objective/scores": 0.0, - "policy/approxkl_avg": 24.928985595703125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.529296875, - "step": 486, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9948756694793701 - }, - { - "episode": 7808, - "epoch": 0.046781943895219946, - "loss/policy_avg": 0.09556100517511368, - "lr": 9.688778118609408e-06, - "objective/entropy": 148.95831298828125, - "objective/kl": 35.879150390625, - "objective/non_score_reward": -1.7939574718475342, - "objective/rlhf_reward": -5.797227957335812, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 71.48307037353516, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4384765625, - "step": 487, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.004147529602051 - }, - { - "episode": 7824, - "epoch": 0.046877808534349495, - "loss/policy_avg": 0.5969531536102295, - "lr": 9.688139059304705e-06, - "objective/entropy": 153.7490234375, - "objective/kl": 48.861671447753906, - "objective/non_score_reward": -2.4430835247039795, - "objective/rlhf_reward": -8.372334098815918, - "objective/scores": 0.35, - "policy/approxkl_avg": 9.396366119384766, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.501953125, - "step": 488, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984536170959473 - }, - { - "episode": 7840, - "epoch": 0.046973673173479044, - "loss/policy_avg": 0.1210302859544754, - "lr": 9.6875e-06, - "objective/entropy": 49.37278366088867, - "objective/kl": 35.451751708984375, - "objective/non_score_reward": -1.7725876569747925, - "objective/rlhf_reward": -9.090351104736328, - "objective/scores": -0.5, - "policy/approxkl_avg": 88.43473815917969, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.544921875, - "step": 489, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997957706451416 - }, - { - "episode": 7856, - "epoch": 0.0470695378126086, - "loss/policy_avg": 0.08776310086250305, - "lr": 9.686860940695297e-06, - "objective/entropy": 47.7550048828125, - "objective/kl": 41.63129425048828, - "objective/non_score_reward": -2.0815649032592773, - "objective/rlhf_reward": -8.32625961303711, - "objective/scores": 0.0, - "policy/approxkl_avg": 38.46742248535156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.39453125, - "step": 490, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998126745223999 - }, - { - "episode": 7872, - "epoch": 0.04716540245173815, - "loss/policy_avg": -0.2399851381778717, - "lr": 9.686221881390594e-06, - "objective/entropy": 16.01573944091797, - "objective/kl": 30.493085861206055, - "objective/non_score_reward": -1.5246543884277344, - "objective/rlhf_reward": -4.674785335262385, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 26.804075241088867, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4716796875, - "step": 491, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9998912811279297 - }, - { - "episode": 7888, - "epoch": 0.0472612670908677, - "loss/policy_avg": 0.13263994455337524, - "lr": 9.68558282208589e-06, - "objective/entropy": 140.36886596679688, - "objective/kl": 35.053714752197266, - "objective/non_score_reward": -1.752685546875, - "objective/rlhf_reward": -9.0107421875, - "objective/scores": -0.5, - "policy/approxkl_avg": 41.00921630859375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.701171875, - "step": 492, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9989254474639893 - }, - { - "episode": 7904, - "epoch": 0.04735713172999725, - "loss/policy_avg": 0.03523946925997734, - "lr": 9.684943762781188e-06, - "objective/entropy": 160.34219360351562, - "objective/kl": 34.51702880859375, - "objective/non_score_reward": -1.72585129737854, - "objective/rlhf_reward": -8.903406143188477, - "objective/scores": -0.5, - "policy/approxkl_avg": 0.8806684613227844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.595703125, - "step": 493, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0012478828430176 - }, - { - "episode": 7920, - "epoch": 0.047452996369126796, - "loss/policy_avg": 0.911601185798645, - "lr": 9.684304703476484e-06, - "objective/entropy": 2.773548126220703, - "objective/kl": 37.37300109863281, - "objective/non_score_reward": -1.868650197982788, - "objective/rlhf_reward": -9.474600791931152, - "objective/scores": -0.5, - "policy/approxkl_avg": 14.019964218139648, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.69140625, - "step": 494, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992573261260986 - }, - { - "episode": 7936, - "epoch": 0.047548861008256345, - "loss/policy_avg": 0.34348466992378235, - "lr": 9.68366564417178e-06, - "objective/entropy": 121.98845672607422, - "objective/kl": 24.834556579589844, - "objective/non_score_reward": -1.2417279481887817, - "objective/rlhf_reward": -0.566911673545837, - "objective/scores": 1.1, - "policy/approxkl_avg": 22.85826301574707, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.572265625, - "step": 495, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001675605773926 - }, - { - "episode": 7952, - "epoch": 0.047644725647385894, - "loss/policy_avg": 0.10158610343933105, - "lr": 9.683026584867076e-06, - "objective/entropy": 135.89080810546875, - "objective/kl": 33.785274505615234, - "objective/non_score_reward": -1.6892638206481934, - "objective/rlhf_reward": -6.757055282592773, - "objective/scores": 0.0, - "policy/approxkl_avg": 60.38434600830078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.51953125, - "step": 496, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9961328506469727 - }, - { - "episode": 7968, - "epoch": 0.04774059028651544, - "loss/policy_avg": 0.23412325978279114, - "lr": 9.682387525562373e-06, - "objective/entropy": 125.34407806396484, - "objective/kl": 30.21947479248047, - "objective/non_score_reward": -1.5109736919403076, - "objective/rlhf_reward": -4.7332141306012865, - "objective/scores": 0.327670248696953, - "policy/approxkl_avg": 55.509063720703125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.66015625, - "step": 497, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988863468170166 - }, - { - "episode": 7984, - "epoch": 0.04783645492564499, - "loss/policy_avg": 0.19422444701194763, - "lr": 9.68174846625767e-06, - "objective/entropy": -99.93417358398438, - "objective/kl": 36.70722579956055, - "objective/non_score_reward": -1.8353612422943115, - "objective/rlhf_reward": -9.341445922851562, - "objective/scores": -0.5, - "policy/approxkl_avg": 62.86932373046875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5234375, - "step": 498, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9983272552490234 - }, - { - "episode": 8000, - "epoch": 0.04793231956477454, - "loss/policy_avg": 0.7712575197219849, - "lr": 9.681109406952967e-06, - "objective/entropy": 185.83950805664062, - "objective/kl": 40.12968063354492, - "objective/non_score_reward": -2.006484031677246, - "objective/rlhf_reward": -6.078524629549916, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 107.38670349121094, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.56640625, - "step": 499, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9978251457214355 - }, - { - "episode": 8016, - "epoch": 0.04802818420390409, - "loss/policy_avg": 0.048789143562316895, - "lr": 9.680470347648262e-06, - "objective/entropy": 80.77729797363281, - "objective/kl": 43.95686340332031, - "objective/non_score_reward": -2.197843313217163, - "objective/rlhf_reward": -7.2756017086827125, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 23.518718719482422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.583984375, - "step": 500, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0010409355163574 - }, - { - "episode": 8032, - "epoch": 0.04812404884303364, - "loss/policy_avg": 1.3080418109893799, - "lr": 9.67983128834356e-06, - "objective/entropy": 25.42633628845215, - "objective/kl": 34.060516357421875, - "objective/non_score_reward": -1.7030255794525146, - "objective/rlhf_reward": -5.207982692782002, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 10.431194305419922, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.556640625, - "step": 501, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000088930130005 - }, - { - "episode": 8048, - "epoch": 0.04821991348216319, - "loss/policy_avg": 0.543501615524292, - "lr": 9.679192229038854e-06, - "objective/entropy": 95.14369201660156, - "objective/kl": 32.53972625732422, - "objective/non_score_reward": -1.6269863843917847, - "objective/rlhf_reward": -8.50794506072998, - "objective/scores": -0.5, - "policy/approxkl_avg": 11.882524490356445, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.765625, - "step": 502, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998069167137146 - }, - { - "episode": 8064, - "epoch": 0.048315778121292736, - "loss/policy_avg": 0.46753692626953125, - "lr": 9.678553169734151e-06, - "objective/entropy": 182.4403533935547, - "objective/kl": 51.98662567138672, - "objective/non_score_reward": -2.5993313789367676, - "objective/rlhf_reward": -12.39732551574707, - "objective/scores": -0.5, - "policy/approxkl_avg": 97.35031127929688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.689453125, - "step": 503, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995888471603394 - }, - { - "episode": 8080, - "epoch": 0.048411642760422285, - "loss/policy_avg": 0.38062262535095215, - "lr": 9.677914110429448e-06, - "objective/entropy": 148.70054626464844, - "objective/kl": 44.446372985839844, - "objective/non_score_reward": -2.222318649291992, - "objective/rlhf_reward": -10.889274597167969, - "objective/scores": -0.5, - "policy/approxkl_avg": 21.08675765991211, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.525390625, - "step": 504, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998737096786499 - }, - { - "episode": 8096, - "epoch": 0.048507507399551834, - "loss/policy_avg": 0.2571317255496979, - "lr": 9.677275051124745e-06, - "objective/entropy": 67.0958251953125, - "objective/kl": 42.69478988647461, - "objective/non_score_reward": -2.134739398956299, - "objective/rlhf_reward": -7.023185694011387, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 69.5447998046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.634765625, - "step": 505, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.995441198348999 - }, - { - "episode": 8112, - "epoch": 0.04860337203868138, - "loss/policy_avg": 0.26023709774017334, - "lr": 9.676635991820042e-06, - "objective/entropy": 134.39599609375, - "objective/kl": 50.59172821044922, - "objective/non_score_reward": -2.5295865535736084, - "objective/rlhf_reward": -12.118346214294434, - "objective/scores": -0.5, - "policy/approxkl_avg": 117.91007995605469, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.697265625, - "step": 506, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9979584217071533 - }, - { - "episode": 8128, - "epoch": 0.04869923667781093, - "loss/policy_avg": 0.023366611450910568, - "lr": 9.675996932515339e-06, - "objective/entropy": 6.523872375488281, - "objective/kl": 33.86302185058594, - "objective/non_score_reward": -1.6931511163711548, - "objective/rlhf_reward": -5.394002297011715, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.230093240737915, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4658203125, - "step": 507, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001136302947998 - }, - { - "episode": 8144, - "epoch": 0.04879510131694048, - "loss/policy_avg": 0.1962001621723175, - "lr": 9.675357873210634e-06, - "objective/entropy": 101.35081481933594, - "objective/kl": 51.96517562866211, - "objective/non_score_reward": -2.5982584953308105, - "objective/rlhf_reward": -9.033784830306454, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 6.275201797485352, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62109375, - "step": 508, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995527267456055 - }, - { - "episode": 8160, - "epoch": 0.04889096595607003, - "loss/policy_avg": 0.32110342383384705, - "lr": 9.67471881390593e-06, - "objective/entropy": 21.122821807861328, - "objective/kl": 40.87200927734375, - "objective/non_score_reward": -2.04360032081604, - "objective/rlhf_reward": -3.774401760101318, - "objective/scores": 1.1, - "policy/approxkl_avg": 77.06642150878906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.826171875, - "step": 509, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9997953176498413 - }, - { - "episode": 8176, - "epoch": 0.04898683059519958, - "loss/policy_avg": 0.11684095114469528, - "lr": 9.674079754601228e-06, - "objective/entropy": 123.6054916381836, - "objective/kl": 43.744590759277344, - "objective/non_score_reward": -2.1872293949127197, - "objective/rlhf_reward": -4.348917520046234, - "objective/scores": 1.1, - "policy/approxkl_avg": 1.2953643798828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4482421875, - "step": 510, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.00039005279541 - }, - { - "episode": 8192, - "epoch": 0.04908269523432913, - "loss/policy_avg": 0.21204860508441925, - "lr": 9.673440695296525e-06, - "objective/entropy": 92.97704315185547, - "objective/kl": 33.71581268310547, - "objective/non_score_reward": -1.6857905387878418, - "objective/rlhf_reward": -8.743162155151367, - "objective/scores": -0.5, - "policy/approxkl_avg": 78.84205627441406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4951171875, - "step": 511, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001296997070312 - }, - { - "episode": 8208, - "epoch": 0.04917855987345868, - "loss/policy_avg": 0.2907988429069519, - "lr": 9.672801635991821e-06, - "objective/entropy": 142.2095184326172, - "objective/kl": 46.76347351074219, - "objective/non_score_reward": -2.3381738662719727, - "objective/rlhf_reward": -11.35269546508789, - "objective/scores": -0.5, - "policy/approxkl_avg": 4.935170650482178, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.56640625, - "step": 512, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9997193813323975 - }, - { - "episode": 8224, - "epoch": 0.049274424512588226, - "loss/policy_avg": -0.0883089154958725, - "lr": 9.672162576687117e-06, - "objective/entropy": 78.84781646728516, - "objective/kl": 37.297393798828125, - "objective/non_score_reward": -1.8648698329925537, - "objective/rlhf_reward": -7.459479093551636, - "objective/scores": 0.0, - "policy/approxkl_avg": 47.29024124145508, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4755859375, - "step": 513, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0175886154174805 - }, - { - "episode": 8240, - "epoch": 0.049370289151717775, - "loss/policy_avg": 1.227190375328064, - "lr": 9.671523517382413e-06, - "objective/entropy": 153.3360595703125, - "objective/kl": 42.09587097167969, - "objective/non_score_reward": -2.1047935485839844, - "objective/rlhf_reward": -10.419174194335938, - "objective/scores": -0.5, - "policy/approxkl_avg": 18.333316802978516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.56640625, - "step": 514, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991519451141357 - }, - { - "episode": 8256, - "epoch": 0.049466153790847324, - "loss/policy_avg": 0.5838215947151184, - "lr": 9.67088445807771e-06, - "objective/entropy": 139.77813720703125, - "objective/kl": 33.785945892333984, - "objective/non_score_reward": -1.6892971992492676, - "objective/rlhf_reward": -4.93236028698356, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 99.73694610595703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4208984375, - "step": 515, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9996287822723389 - }, - { - "episode": 8272, - "epoch": 0.04956201842997687, - "loss/policy_avg": 0.059313490986824036, - "lr": 9.670245398773007e-06, - "objective/entropy": 14.463359832763672, - "objective/kl": 34.218177795410156, - "objective/non_score_reward": -1.710909128189087, - "objective/rlhf_reward": -8.843635559082031, - "objective/scores": -0.5, - "policy/approxkl_avg": 35.5083122253418, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.65625, - "step": 516, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997997760772705 - }, - { - "episode": 8288, - "epoch": 0.04965788306910642, - "loss/policy_avg": 0.2258259505033493, - "lr": 9.669606339468304e-06, - "objective/entropy": -95.49360656738281, - "objective/kl": 22.19683837890625, - "objective/non_score_reward": -1.10984206199646, - "objective/rlhf_reward": -4.439367949962616, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.760451316833496, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.53515625, - "step": 517, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9969854354858398 - }, - { - "episode": 8304, - "epoch": 0.04975374770823597, - "loss/policy_avg": 0.10804040729999542, - "lr": 9.668967280163601e-06, - "objective/entropy": 143.58779907226562, - "objective/kl": 35.785003662109375, - "objective/non_score_reward": -1.789250135421753, - "objective/rlhf_reward": -9.157001495361328, - "objective/scores": -0.5, - "policy/approxkl_avg": 47.657981872558594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.794921875, - "step": 518, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9962890148162842 - }, - { - "episode": 8320, - "epoch": 0.04984961234736552, - "loss/policy_avg": 0.009697876870632172, - "lr": 9.668328220858896e-06, - "objective/entropy": 10.224929809570312, - "objective/kl": 37.23715591430664, - "objective/non_score_reward": -1.8618578910827637, - "objective/rlhf_reward": -5.891172378268793, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 1.469694972038269, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3876953125, - "step": 519, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995241165161133 - }, - { - "episode": 8336, - "epoch": 0.04994547698649507, - "loss/policy_avg": 0.07935798913240433, - "lr": 9.667689161554193e-06, - "objective/entropy": 37.64440155029297, - "objective/kl": 41.3823356628418, - "objective/non_score_reward": -2.0691165924072266, - "objective/rlhf_reward": -10.276466369628906, - "objective/scores": -0.5, - "policy/approxkl_avg": 27.435813903808594, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.546875, - "step": 520, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001011848449707 - }, - { - "episode": 8352, - "epoch": 0.05004134162562462, - "loss/policy_avg": -0.02070830762386322, - "lr": 9.66705010224949e-06, - "objective/entropy": 58.74858474731445, - "objective/kl": 43.55432891845703, - "objective/non_score_reward": -2.1777162551879883, - "objective/rlhf_reward": -7.332263686744076, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 6.468544960021973, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.607421875, - "step": 521, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001483917236328 - }, - { - "episode": 8368, - "epoch": 0.05013720626475417, - "loss/policy_avg": 0.46028798818588257, - "lr": 9.666411042944787e-06, - "objective/entropy": -78.76937866210938, - "objective/kl": 23.173397064208984, - "objective/non_score_reward": -1.1586699485778809, - "objective/rlhf_reward": -6.634679794311523, - "objective/scores": -0.5, - "policy/approxkl_avg": 24.691064834594727, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.51953125, - "step": 522, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9976732730865479 - }, - { - "episode": 8384, - "epoch": 0.050233070903883716, - "loss/policy_avg": 0.18045517802238464, - "lr": 9.665771983640082e-06, - "objective/entropy": 46.4280891418457, - "objective/kl": 40.75489044189453, - "objective/non_score_reward": -2.0377445220947266, - "objective/rlhf_reward": -8.150978326797485, - "objective/scores": 0.0, - "policy/approxkl_avg": 3.3056907653808594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.44921875, - "step": 523, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9977219104766846 - }, - { - "episode": 8400, - "epoch": 0.050328935543013265, - "loss/policy_avg": 0.14005348086357117, - "lr": 9.665132924335379e-06, - "objective/entropy": 30.292451858520508, - "objective/kl": 34.13593292236328, - "objective/non_score_reward": -1.706796646118164, - "objective/rlhf_reward": -8.827186584472656, - "objective/scores": -0.5, - "policy/approxkl_avg": 5.698740005493164, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4794921875, - "step": 524, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0000524520874023 - }, - { - "episode": 8416, - "epoch": 0.050424800182142814, - "loss/policy_avg": 0.06975753605365753, - "lr": 9.664493865030676e-06, - "objective/entropy": 83.52384948730469, - "objective/kl": 34.249717712402344, - "objective/non_score_reward": -1.7124860286712646, - "objective/rlhf_reward": -6.849944233894348, - "objective/scores": 0.0, - "policy/approxkl_avg": 24.646453857421875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.58203125, - "step": 525, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0034072399139404 - }, - { - "episode": 8432, - "epoch": 0.05052066482127236, - "loss/policy_avg": 0.22392672300338745, - "lr": 9.663854805725971e-06, - "objective/entropy": 64.02466583251953, - "objective/kl": 34.11146545410156, - "objective/non_score_reward": -1.705573320388794, - "objective/rlhf_reward": -8.822293281555176, - "objective/scores": -0.5, - "policy/approxkl_avg": 42.05424118041992, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.65234375, - "step": 526, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9970359802246094 - }, - { - "episode": 8448, - "epoch": 0.05061652946040191, - "loss/policy_avg": 1.0532065629959106, - "lr": 9.663215746421268e-06, - "objective/entropy": 90.72592163085938, - "objective/kl": 41.28602600097656, - "objective/non_score_reward": -2.0643014907836914, - "objective/rlhf_reward": -6.931692812472505, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 31.192419052124023, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.4990234375, - "step": 527, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0001697540283203 - }, - { - "episode": 8464, - "epoch": 0.05071239409953146, - "loss/policy_avg": 0.0644269585609436, - "lr": 9.662576687116565e-06, - "objective/entropy": 67.44807434082031, - "objective/kl": 36.54154586791992, - "objective/non_score_reward": -1.8270775079727173, - "objective/rlhf_reward": -7.30830979347229, - "objective/scores": 0.0, - "policy/approxkl_avg": 17.966569900512695, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5078125, - "step": 528, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998882532119751 - }, - { - "episode": 8480, - "epoch": 0.05080825873866101, - "loss/policy_avg": -0.2570219039916992, - "lr": 9.661937627811862e-06, - "objective/entropy": -26.96208953857422, - "objective/kl": 34.49934768676758, - "objective/non_score_reward": -1.7249674797058105, - "objective/rlhf_reward": -6.899869918823242, - "objective/scores": 0.0, - "policy/approxkl_avg": 20.390092849731445, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.3447265625, - "step": 529, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000749111175537 - }, - { - "episode": 8496, - "epoch": 0.05090412337779056, - "loss/policy_avg": 0.25002214312553406, - "lr": 9.661298568507158e-06, - "objective/entropy": 91.19659423828125, - "objective/kl": 35.412471771240234, - "objective/non_score_reward": -1.7706236839294434, - "objective/rlhf_reward": -9.082494735717773, - "objective/scores": -0.5, - "policy/approxkl_avg": 166.57394409179688, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4619140625, - "step": 530, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9965037107467651 - }, - { - "episode": 8512, - "epoch": 0.05099998801692011, - "loss/policy_avg": 0.022846542298793793, - "lr": 9.660659509202455e-06, - "objective/entropy": -37.26931381225586, - "objective/kl": 39.86629867553711, - "objective/non_score_reward": -1.9933149814605713, - "objective/rlhf_reward": -7.973260045051575, - "objective/scores": 0.0, - "policy/approxkl_avg": 56.44502258300781, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4677734375, - "step": 531, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974274635314941 - }, - { - "episode": 8528, - "epoch": 0.051095852656049656, - "loss/policy_avg": 0.12022869288921356, - "lr": 9.66002044989775e-06, - "objective/entropy": 67.10712432861328, - "objective/kl": 41.30962371826172, - "objective/non_score_reward": -2.065481185913086, - "objective/rlhf_reward": -8.261924982070923, - "objective/scores": 0.0, - "policy/approxkl_avg": 46.29387664794922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.55859375, - "step": 532, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975337982177734 - }, - { - "episode": 8544, - "epoch": 0.051191717295179205, - "loss/policy_avg": 0.6448026895523071, - "lr": 9.659381390593047e-06, - "objective/entropy": 11.027664184570312, - "objective/kl": 40.83232498168945, - "objective/non_score_reward": -2.0416159629821777, - "objective/rlhf_reward": -8.16646432876587, - "objective/scores": 0.0, - "policy/approxkl_avg": 28.078399658203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.60546875, - "step": 533, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000098943710327 - }, - { - "episode": 8560, - "epoch": 0.051287581934308754, - "loss/policy_avg": 0.8582497835159302, - "lr": 9.658742331288344e-06, - "objective/entropy": 113.14666748046875, - "objective/kl": 42.10472106933594, - "objective/non_score_reward": -2.105236053466797, - "objective/rlhf_reward": -8.420944571495056, - "objective/scores": 0.0, - "policy/approxkl_avg": 14.971941947937012, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.548828125, - "step": 534, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9971849918365479 - }, - { - "episode": 8576, - "epoch": 0.0513834465734383, - "loss/policy_avg": 0.038903310894966125, - "lr": 9.658103271983641e-06, - "objective/entropy": 143.5253448486328, - "objective/kl": 42.91957092285156, - "objective/non_score_reward": -2.1459789276123047, - "objective/rlhf_reward": -7.18391523361206, - "objective/scores": 0.35, - "policy/approxkl_avg": 17.93328857421875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.796875, - "step": 535, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9967997074127197 - }, - { - "episode": 8592, - "epoch": 0.05147931121256785, - "loss/policy_avg": 0.25293684005737305, - "lr": 9.657464212678938e-06, - "objective/entropy": 96.65731811523438, - "objective/kl": 40.929542541503906, - "objective/non_score_reward": -2.0464773178100586, - "objective/rlhf_reward": -3.7859089136123654, - "objective/scores": 1.1, - "policy/approxkl_avg": 86.96218872070312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.521484375, - "step": 536, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9961689710617065 - }, - { - "episode": 8608, - "epoch": 0.0515751758516974, - "loss/policy_avg": 0.3026430606842041, - "lr": 9.656825153374235e-06, - "objective/entropy": 28.206890106201172, - "objective/kl": 32.628379821777344, - "objective/non_score_reward": -1.6314189434051514, - "objective/rlhf_reward": -6.525675892829895, - "objective/scores": 0.0, - "policy/approxkl_avg": 24.484819412231445, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.529296875, - "step": 537, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.99712336063385 - }, - { - "episode": 8624, - "epoch": 0.05167104049082695, - "loss/policy_avg": -0.16653533279895782, - "lr": 9.65618609406953e-06, - "objective/entropy": -77.3916015625, - "objective/kl": 30.037582397460938, - "objective/non_score_reward": -1.5018792152404785, - "objective/rlhf_reward": -6.007516622543335, - "objective/scores": 0.0, - "policy/approxkl_avg": 87.15878295898438, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6015625, - "step": 538, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.007481575012207 - }, - { - "episode": 8640, - "epoch": 0.0517669051299565, - "loss/policy_avg": 0.11232887953519821, - "lr": 9.655547034764827e-06, - "objective/entropy": 55.08220291137695, - "objective/kl": 43.097896575927734, - "objective/non_score_reward": -2.1548948287963867, - "objective/rlhf_reward": -8.6195787191391, - "objective/scores": 0.0, - "policy/approxkl_avg": 26.015697479248047, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.60546875, - "step": 539, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0278120040893555 - }, - { - "episode": 8656, - "epoch": 0.05186276976908605, - "loss/policy_avg": 0.6124523878097534, - "lr": 9.654907975460124e-06, - "objective/entropy": 34.89768600463867, - "objective/kl": 46.468101501464844, - "objective/non_score_reward": -2.3234052658081055, - "objective/rlhf_reward": -9.293620705604553, - "objective/scores": 0.0, - "policy/approxkl_avg": 6.492743492126465, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3681640625, - "step": 540, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999890923500061 - }, - { - "episode": 8672, - "epoch": 0.0519586344082156, - "loss/policy_avg": 0.2736782133579254, - "lr": 9.65426891615542e-06, - "objective/entropy": 39.75823211669922, - "objective/kl": 30.90910530090332, - "objective/non_score_reward": -1.5454552173614502, - "objective/rlhf_reward": -4.666049086841282, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 31.34353256225586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4521484375, - "step": 541, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000300884246826 - }, - { - "episode": 8688, - "epoch": 0.052054499047345146, - "loss/policy_avg": 0.4218829870223999, - "lr": 9.653629856850718e-06, - "objective/entropy": 208.8717041015625, - "objective/kl": 39.65779495239258, - "objective/non_score_reward": -1.9828898906707764, - "objective/rlhf_reward": -6.5529573942102015, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 68.46629333496094, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.662109375, - "step": 542, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9981236457824707 - }, - { - "episode": 8704, - "epoch": 0.052150363686474695, - "loss/policy_avg": 0.45531219244003296, - "lr": 9.652990797546013e-06, - "objective/entropy": 209.2900390625, - "objective/kl": 33.24062728881836, - "objective/non_score_reward": -1.6620312929153442, - "objective/rlhf_reward": -5.248125171661377, - "objective/scores": 0.35, - "policy/approxkl_avg": 40.11018371582031, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.658203125, - "step": 543, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9973430633544922 - }, - { - "episode": 8720, - "epoch": 0.052246228325604244, - "loss/policy_avg": 0.21141277253627777, - "lr": 9.65235173824131e-06, - "objective/entropy": 158.35287475585938, - "objective/kl": 32.80064392089844, - "objective/non_score_reward": -1.6400320529937744, - "objective/rlhf_reward": -8.560128211975098, - "objective/scores": -0.5, - "policy/approxkl_avg": 55.47556686401367, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.658203125, - "step": 544, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9978859424591064 - }, - { - "episode": 8736, - "epoch": 0.05234209296473379, - "loss/policy_avg": 0.4950694143772125, - "lr": 9.651712678936605e-06, - "objective/entropy": 92.91190338134766, - "objective/kl": 28.654132843017578, - "objective/non_score_reward": -1.4327068328857422, - "objective/rlhf_reward": -3.905998165878366, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 13.473176956176758, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.677734375, - "step": 545, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9976662397384644 - }, - { - "episode": 8752, - "epoch": 0.05243795760386334, - "loss/policy_avg": 0.1986934244632721, - "lr": 9.651073619631902e-06, - "objective/entropy": 72.54715728759766, - "objective/kl": 33.77407455444336, - "objective/non_score_reward": -1.6887036561965942, - "objective/rlhf_reward": -6.754814624786377, - "objective/scores": 0.0, - "policy/approxkl_avg": 13.790899276733398, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619140625, - "step": 546, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987945556640625 - }, - { - "episode": 8768, - "epoch": 0.0525338222429929, - "loss/policy_avg": 0.17602220177650452, - "lr": 9.650434560327199e-06, - "objective/entropy": 19.197650909423828, - "objective/kl": 42.189781188964844, - "objective/non_score_reward": -2.1094889640808105, - "objective/rlhf_reward": -5.514237318874571, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 83.59529113769531, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.478515625, - "step": 547, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978179931640625 - }, - { - "episode": 8784, - "epoch": 0.052629686882122446, - "loss/policy_avg": 0.045998621731996536, - "lr": 9.649795501022496e-06, - "objective/entropy": 158.06063842773438, - "objective/kl": 40.811397552490234, - "objective/non_score_reward": -2.04056978225708, - "objective/rlhf_reward": -10.16227912902832, - "objective/scores": -0.5, - "policy/approxkl_avg": 111.83248901367188, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4892578125, - "step": 548, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9980087280273438 - }, - { - "episode": 8800, - "epoch": 0.052725551521251995, - "loss/policy_avg": 0.30770862102508545, - "lr": 9.649156441717792e-06, - "objective/entropy": 11.81429672241211, - "objective/kl": 30.653812408447266, - "objective/non_score_reward": -1.5326905250549316, - "objective/rlhf_reward": -4.52664235598238, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 12.983512878417969, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.4384765625, - "step": 549, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991545677185059 - }, - { - "episode": 8816, - "epoch": 0.052821416160381544, - "loss/policy_avg": 0.14274156093597412, - "lr": 9.64851738241309e-06, - "objective/entropy": 103.85163116455078, - "objective/kl": 43.19879913330078, - "objective/non_score_reward": -2.159940004348755, - "objective/rlhf_reward": -6.517053785101448, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 100.42656707763672, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6875, - "step": 550, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9965872764587402 - }, - { - "episode": 8832, - "epoch": 0.05291728079951109, - "loss/policy_avg": 0.06003594398498535, - "lr": 9.647878323108384e-06, - "objective/entropy": 179.23623657226562, - "objective/kl": 40.46935272216797, - "objective/non_score_reward": -2.023467779159546, - "objective/rlhf_reward": -6.752235582380919, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 20.83936882019043, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5625, - "step": 551, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.001079797744751 - }, - { - "episode": 8848, - "epoch": 0.05301314543864064, - "loss/policy_avg": 0.5863113403320312, - "lr": 9.647239263803681e-06, - "objective/entropy": 76.55807495117188, - "objective/kl": 36.17455291748047, - "objective/non_score_reward": -1.8087276220321655, - "objective/rlhf_reward": -9.23491096496582, - "objective/scores": -0.5, - "policy/approxkl_avg": 4.267086029052734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3798828125, - "step": 552, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001115322113037 - }, - { - "episode": 8864, - "epoch": 0.05310901007777019, - "loss/policy_avg": 2.5198092460632324, - "lr": 9.646600204498978e-06, - "objective/entropy": 31.032962799072266, - "objective/kl": 36.13847351074219, - "objective/non_score_reward": -1.8069238662719727, - "objective/rlhf_reward": -9.22769546508789, - "objective/scores": -0.5, - "policy/approxkl_avg": 5.862698554992676, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.568359375, - "step": 553, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0627989768981934 - }, - { - "episode": 8880, - "epoch": 0.05320487471689974, - "loss/policy_avg": 0.135384663939476, - "lr": 9.645961145194275e-06, - "objective/entropy": 39.74359893798828, - "objective/kl": 45.49197006225586, - "objective/non_score_reward": -2.2745985984802246, - "objective/rlhf_reward": -11.098394393920898, - "objective/scores": -0.5, - "policy/approxkl_avg": 11.076536178588867, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5703125, - "step": 554, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0400075912475586 - }, - { - "episode": 8896, - "epoch": 0.05330073935602929, - "loss/policy_avg": 0.17068403959274292, - "lr": 9.645322085889572e-06, - "objective/entropy": 131.28785705566406, - "objective/kl": 41.12070846557617, - "objective/non_score_reward": -2.056035280227661, - "objective/rlhf_reward": -6.913460275134444, - "objective/scores": 0.327670248696953, - "policy/approxkl_avg": 19.08953094482422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.396484375, - "step": 555, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9997930526733398 - }, - { - "episode": 8912, - "epoch": 0.05339660399515884, - "loss/policy_avg": 0.006861642003059387, - "lr": 9.644683026584867e-06, - "objective/entropy": 71.55320739746094, - "objective/kl": 31.577491760253906, - "objective/non_score_reward": -1.5788745880126953, - "objective/rlhf_reward": -4.75923916598852, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 11.279380798339844, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.58984375, - "step": 556, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0002732276916504 - }, - { - "episode": 8928, - "epoch": 0.05349246863428839, - "loss/policy_avg": 0.3932980000972748, - "lr": 9.644043967280164e-06, - "objective/entropy": 173.74667358398438, - "objective/kl": 39.91008758544922, - "objective/non_score_reward": -1.9955044984817505, - "objective/rlhf_reward": -6.157189126285623, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 75.61815643310547, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5390625, - "step": 557, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9991559982299805 - }, - { - "episode": 8944, - "epoch": 0.053588333273417936, - "loss/policy_avg": 0.41776636242866516, - "lr": 9.643404907975461e-06, - "objective/entropy": 101.93467712402344, - "objective/kl": 33.950069427490234, - "objective/non_score_reward": -1.6975035667419434, - "objective/rlhf_reward": -8.790014266967773, - "objective/scores": -0.5, - "policy/approxkl_avg": 2.91536021232605, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7265625, - "step": 558, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994499683380127 - }, - { - "episode": 8960, - "epoch": 0.053684197912547485, - "loss/policy_avg": 3.235675811767578, - "lr": 9.642765848670758e-06, - "objective/entropy": 157.3035888671875, - "objective/kl": 48.83702087402344, - "objective/non_score_reward": -2.4418513774871826, - "objective/rlhf_reward": -11.76740550994873, - "objective/scores": -0.5, - "policy/approxkl_avg": 152.27467346191406, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.474609375, - "step": 559, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0070905685424805 - }, - { - "episode": 8976, - "epoch": 0.053780062551677034, - "loss/policy_avg": 0.4991706907749176, - "lr": 9.642126789366055e-06, - "objective/entropy": 163.01913452148438, - "objective/kl": 44.97026062011719, - "objective/non_score_reward": -2.2485132217407227, - "objective/rlhf_reward": -6.5940522909164425, - "objective/scores": 0.6, - "policy/approxkl_avg": 54.085365295410156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.58203125, - "step": 560, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9990899562835693 - }, - { - "episode": 8992, - "epoch": 0.05387592719080658, - "loss/policy_avg": 0.35669660568237305, - "lr": 9.641487730061352e-06, - "objective/entropy": 46.909828186035156, - "objective/kl": 31.477493286132812, - "objective/non_score_reward": -1.573874592781067, - "objective/rlhf_reward": -4.953862717657714, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 65.30628967285156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.533203125, - "step": 561, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9979372024536133 - }, - { - "episode": 9008, - "epoch": 0.05397179182993613, - "loss/policy_avg": 1.0564548969268799, - "lr": 9.640848670756647e-06, - "objective/entropy": 144.66534423828125, - "objective/kl": 50.382301330566406, - "objective/non_score_reward": -2.5191149711608887, - "objective/rlhf_reward": -8.560688340457615, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 48.65314483642578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.748046875, - "step": 562, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982779026031494 - }, - { - "episode": 9024, - "epoch": 0.05406765646906568, - "loss/policy_avg": 0.11716368794441223, - "lr": 9.640209611451944e-06, - "objective/entropy": 98.48894500732422, - "objective/kl": 28.6131649017334, - "objective/non_score_reward": -1.4306584596633911, - "objective/rlhf_reward": -7.7226338386535645, - "objective/scores": -0.5, - "policy/approxkl_avg": 4.523138999938965, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4130859375, - "step": 563, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0049519538879395 - }, - { - "episode": 9040, - "epoch": 0.05416352110819523, - "loss/policy_avg": 0.7250778675079346, - "lr": 9.63957055214724e-06, - "objective/entropy": 133.11463928222656, - "objective/kl": 37.375694274902344, - "objective/non_score_reward": -1.868784785270691, - "objective/rlhf_reward": -5.075139141082763, - "objective/scores": 0.6, - "policy/approxkl_avg": 13.915103912353516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.41796875, - "step": 564, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9986052513122559 - }, - { - "episode": 9056, - "epoch": 0.05425938574732478, - "loss/policy_avg": 0.7018356323242188, - "lr": 9.638931492842537e-06, - "objective/entropy": 89.61590576171875, - "objective/kl": 41.851219177246094, - "objective/non_score_reward": -2.0925612449645996, - "objective/rlhf_reward": -6.919646601291046, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 24.61191177368164, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.54296875, - "step": 565, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9996438026428223 - }, - { - "episode": 9072, - "epoch": 0.05435525038645433, - "loss/policy_avg": 0.07808268815279007, - "lr": 9.638292433537834e-06, - "objective/entropy": 67.98313903808594, - "objective/kl": 45.94068145751953, - "objective/non_score_reward": -2.29703426361084, - "objective/rlhf_reward": -9.188136219978333, - "objective/scores": 0.0, - "policy/approxkl_avg": 38.387054443359375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.57421875, - "step": 566, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9997014999389648 - }, - { - "episode": 9088, - "epoch": 0.05445111502558388, - "loss/policy_avg": 0.0561227947473526, - "lr": 9.63765337423313e-06, - "objective/entropy": -102.10490417480469, - "objective/kl": 32.92976379394531, - "objective/non_score_reward": -1.6464881896972656, - "objective/rlhf_reward": -6.585952877998352, - "objective/scores": 0.0, - "policy/approxkl_avg": 9.602782249450684, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.54296875, - "step": 567, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0004687309265137 - }, - { - "episode": 9104, - "epoch": 0.054546979664713426, - "loss/policy_avg": 0.5744443535804749, - "lr": 9.637014314928426e-06, - "objective/entropy": 78.12531280517578, - "objective/kl": 36.76774597167969, - "objective/non_score_reward": -1.8383872509002686, - "objective/rlhf_reward": -9.353549003601074, - "objective/scores": -0.5, - "policy/approxkl_avg": 55.96208572387695, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.51171875, - "step": 568, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991743564605713 - }, - { - "episode": 9120, - "epoch": 0.054642844303842975, - "loss/policy_avg": 0.564866304397583, - "lr": 9.636375255623721e-06, - "objective/entropy": 135.82896423339844, - "objective/kl": 43.04612731933594, - "objective/non_score_reward": -2.15230655670166, - "objective/rlhf_reward": -10.60922622680664, - "objective/scores": -0.5, - "policy/approxkl_avg": 42.51314163208008, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4541015625, - "step": 569, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000321388244629 - }, - { - "episode": 9136, - "epoch": 0.054738708942972523, - "loss/policy_avg": 0.16940301656723022, - "lr": 9.635736196319018e-06, - "objective/entropy": -114.63182830810547, - "objective/kl": 36.86830139160156, - "objective/non_score_reward": -1.8434150218963623, - "objective/rlhf_reward": -9.373659133911133, - "objective/scores": -0.5, - "policy/approxkl_avg": 26.11627960205078, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.67578125, - "step": 570, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9971542358398438 - }, - { - "episode": 9152, - "epoch": 0.05483457358210207, - "loss/policy_avg": 0.020508363842964172, - "lr": 9.635097137014315e-06, - "objective/entropy": 87.90492248535156, - "objective/kl": 46.06084442138672, - "objective/non_score_reward": -2.303041934967041, - "objective/rlhf_reward": -9.212168216705322, - "objective/scores": 0.0, - "policy/approxkl_avg": 56.395973205566406, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.482421875, - "step": 571, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000852584838867 - }, - { - "episode": 9168, - "epoch": 0.05493043822123162, - "loss/policy_avg": -0.007636541500687599, - "lr": 9.634458077709612e-06, - "objective/entropy": -84.44242858886719, - "objective/kl": 36.771697998046875, - "objective/non_score_reward": -1.8385847806930542, - "objective/rlhf_reward": -5.995089256499691, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 0.3396506905555725, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.52734375, - "step": 572, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0062503814697266 - }, - { - "episode": 9184, - "epoch": 0.05502630286036117, - "loss/policy_avg": -0.1648610532283783, - "lr": 9.633819018404909e-06, - "objective/entropy": -70.38121795654297, - "objective/kl": 37.25817108154297, - "objective/non_score_reward": -1.8629084825515747, - "objective/rlhf_reward": -9.45163345336914, - "objective/scores": -0.5, - "policy/approxkl_avg": 35.33013916015625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.455078125, - "step": 573, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978995323181152 - }, - { - "episode": 9200, - "epoch": 0.05512216749949072, - "loss/policy_avg": 0.21912901103496552, - "lr": 9.633179959100206e-06, - "objective/entropy": 65.07223510742188, - "objective/kl": 37.938026428222656, - "objective/non_score_reward": -1.8969011306762695, - "objective/rlhf_reward": -5.854271189371744, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 179.24462890625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.634765625, - "step": 574, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9971550703048706 - }, - { - "episode": 9216, - "epoch": 0.05521803213862027, - "loss/policy_avg": 0.42310407757759094, - "lr": 9.632540899795501e-06, - "objective/entropy": -74.82484436035156, - "objective/kl": 36.85780334472656, - "objective/non_score_reward": -1.8428901433944702, - "objective/rlhf_reward": -5.54673218277366, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 7.413464546203613, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.53515625, - "step": 575, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000021457672119 - }, - { - "episode": 9232, - "epoch": 0.05531389677774982, - "loss/policy_avg": 0.1522914469242096, - "lr": 9.631901840490798e-06, - "objective/entropy": -78.68695068359375, - "objective/kl": 36.93750762939453, - "objective/non_score_reward": -1.8468754291534424, - "objective/rlhf_reward": -7.387501657009125, - "objective/scores": 0.0, - "policy/approxkl_avg": 299.15435791015625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625, - "step": 576, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0027832984924316 - }, - { - "episode": 9248, - "epoch": 0.055409761416879366, - "loss/policy_avg": 0.09173239022493362, - "lr": 9.631262781186095e-06, - "objective/entropy": 206.1100616455078, - "objective/kl": 36.94264221191406, - "objective/non_score_reward": -1.8471322059631348, - "objective/rlhf_reward": -6.009926774588925, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 51.173519134521484, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.623046875, - "step": 577, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9956302642822266 - }, - { - "episode": 9264, - "epoch": 0.055505626056008915, - "loss/policy_avg": 0.8109121322631836, - "lr": 9.630623721881392e-06, - "objective/entropy": -66.37505340576172, - "objective/kl": 42.11962127685547, - "objective/non_score_reward": -2.1059811115264893, - "objective/rlhf_reward": -7.023924565315246, - "objective/scores": 0.35, - "policy/approxkl_avg": 154.45883178710938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.568359375, - "step": 578, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999234676361084 - }, - { - "episode": 9280, - "epoch": 0.055601490695138464, - "loss/policy_avg": 0.20485125482082367, - "lr": 9.629984662576689e-06, - "objective/entropy": 172.38034057617188, - "objective/kl": 52.39830780029297, - "objective/non_score_reward": -2.61991548538208, - "objective/rlhf_reward": -9.079662299156189, - "objective/scores": 0.35, - "policy/approxkl_avg": 22.601041793823242, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.50390625, - "step": 579, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0032167434692383 - }, - { - "episode": 9296, - "epoch": 0.05569735533426801, - "loss/policy_avg": 0.7395042181015015, - "lr": 9.629345603271984e-06, - "objective/entropy": 7.457405090332031, - "objective/kl": 45.84647750854492, - "objective/non_score_reward": -2.2923238277435303, - "objective/rlhf_reward": -9.169295310974121, - "objective/scores": 0.0, - "policy/approxkl_avg": 124.10131072998047, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.54296875, - "step": 580, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992761611938477 - }, - { - "episode": 9312, - "epoch": 0.05579321997339756, - "loss/policy_avg": 0.23498813807964325, - "lr": 9.62870654396728e-06, - "objective/entropy": -11.64200210571289, - "objective/kl": 43.51708984375, - "objective/non_score_reward": -2.175854444503784, - "objective/rlhf_reward": -4.303417301177978, - "objective/scores": 1.1, - "policy/approxkl_avg": 15.8449068069458, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.587890625, - "step": 581, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0007996559143066 - }, - { - "episode": 9328, - "epoch": 0.05588908461252711, - "loss/policy_avg": -0.26322293281555176, - "lr": 9.628067484662578e-06, - "objective/entropy": 14.886768341064453, - "objective/kl": 48.95528030395508, - "objective/non_score_reward": -2.4477639198303223, - "objective/rlhf_reward": -8.431805813048763, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.678018569946289, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.43359375, - "step": 582, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0002763271331787 - }, - { - "episode": 9344, - "epoch": 0.05598494925165666, - "loss/policy_avg": 0.37311238050460815, - "lr": 9.627428425357874e-06, - "objective/entropy": -29.544679641723633, - "objective/kl": 44.6279411315918, - "objective/non_score_reward": -2.2313971519470215, - "objective/rlhf_reward": -7.474990586848602, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 120.46185302734375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4990234375, - "step": 583, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997020959854126 - }, - { - "episode": 9360, - "epoch": 0.05608081389078621, - "loss/policy_avg": 0.3765791654586792, - "lr": 9.626789366053171e-06, - "objective/entropy": 27.12002182006836, - "objective/kl": 39.4703369140625, - "objective/non_score_reward": -1.9735169410705566, - "objective/rlhf_reward": -9.894067764282227, - "objective/scores": -0.5, - "policy/approxkl_avg": 4.805877208709717, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.529296875, - "step": 584, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993363618850708 - }, - { - "episode": 9376, - "epoch": 0.05617667852991576, - "loss/policy_avg": 0.03627479076385498, - "lr": 9.626150306748468e-06, - "objective/entropy": 15.946697235107422, - "objective/kl": 54.321632385253906, - "objective/non_score_reward": -2.7160816192626953, - "objective/rlhf_reward": -12.864326477050781, - "objective/scores": -0.5, - "policy/approxkl_avg": 170.80502319335938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5078125, - "step": 585, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0002691745758057 - }, - { - "episode": 9392, - "epoch": 0.05627254316904531, - "loss/policy_avg": -0.02700839936733246, - "lr": 9.625511247443763e-06, - "objective/entropy": 86.60926055908203, - "objective/kl": 35.19524383544922, - "objective/non_score_reward": -1.7597622871398926, - "objective/rlhf_reward": -9.03904914855957, - "objective/scores": -0.5, - "policy/approxkl_avg": 8.260379791259766, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.755859375, - "step": 586, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0021920204162598 - }, - { - "episode": 9408, - "epoch": 0.056368407808174856, - "loss/policy_avg": 0.1763666421175003, - "lr": 9.62487218813906e-06, - "objective/entropy": 144.70706176757812, - "objective/kl": 23.635494232177734, - "objective/non_score_reward": -1.181774616241455, - "objective/rlhf_reward": -6.72709846496582, - "objective/scores": -0.5, - "policy/approxkl_avg": 5.851166725158691, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6796875, - "step": 587, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9997590780258179 - }, - { - "episode": 9424, - "epoch": 0.056464272447304405, - "loss/policy_avg": 0.527219295501709, - "lr": 9.624233128834357e-06, - "objective/entropy": 173.60789489746094, - "objective/kl": 42.092262268066406, - "objective/non_score_reward": -2.1046133041381836, - "objective/rlhf_reward": -8.418453335762024, - "objective/scores": 0.0, - "policy/approxkl_avg": 93.84263610839844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5078125, - "step": 588, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996020793914795 - }, - { - "episode": 9440, - "epoch": 0.056560137086433954, - "loss/policy_avg": 0.16997206211090088, - "lr": 9.623594069529654e-06, - "objective/entropy": 100.782470703125, - "objective/kl": 33.46315002441406, - "objective/non_score_reward": -1.6731575727462769, - "objective/rlhf_reward": -8.692630767822266, - "objective/scores": -0.5, - "policy/approxkl_avg": 23.64217185974121, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.68359375, - "step": 589, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9955521821975708 - }, - { - "episode": 9456, - "epoch": 0.0566560017255635, - "loss/policy_avg": 0.43916282057762146, - "lr": 9.62295501022495e-06, - "objective/entropy": -60.59128189086914, - "objective/kl": 42.55094909667969, - "objective/non_score_reward": -2.1275475025177, - "objective/rlhf_reward": -10.5101900100708, - "objective/scores": -0.5, - "policy/approxkl_avg": 162.38119506835938, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6796875, - "step": 590, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9992976188659668 - }, - { - "episode": 9472, - "epoch": 0.05675186636469305, - "loss/policy_avg": 0.412034809589386, - "lr": 9.622315950920246e-06, - "objective/entropy": 55.43243408203125, - "objective/kl": 33.46851348876953, - "objective/non_score_reward": -1.6734256744384766, - "objective/rlhf_reward": -8.693702697753906, - "objective/scores": -0.5, - "policy/approxkl_avg": 149.49728393554688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.595703125, - "step": 591, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9977531433105469 - }, - { - "episode": 9488, - "epoch": 0.0568477310038226, - "loss/policy_avg": 0.28347572684288025, - "lr": 9.621676891615543e-06, - "objective/entropy": 35.55633544921875, - "objective/kl": 40.687564849853516, - "objective/non_score_reward": -2.0343782901763916, - "objective/rlhf_reward": -5.213794265629026, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 117.67996215820312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.591796875, - "step": 592, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9995044469833374 - }, - { - "episode": 9504, - "epoch": 0.05694359564295215, - "loss/policy_avg": 0.7297570705413818, - "lr": 9.621037832310838e-06, - "objective/entropy": -104.14797973632812, - "objective/kl": 34.44893264770508, - "objective/non_score_reward": -1.7224466800689697, - "objective/rlhf_reward": -6.889786720275879, - "objective/scores": 0.0, - "policy/approxkl_avg": 22.624303817749023, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4912109375, - "step": 593, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997528314590454 - }, - { - "episode": 9520, - "epoch": 0.0570394602820817, - "loss/policy_avg": 0.0011496543884277344, - "lr": 9.620398773006135e-06, - "objective/entropy": 224.37185668945312, - "objective/kl": 27.430057525634766, - "objective/non_score_reward": -1.3715028762817383, - "objective/rlhf_reward": -3.5386005146073654, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 5.8247833251953125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.71875, - "step": 594, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000392436981201 - }, - { - "episode": 9536, - "epoch": 0.05713532492121125, - "loss/policy_avg": 0.23965345323085785, - "lr": 9.619759713701432e-06, - "objective/entropy": 25.137657165527344, - "objective/kl": 37.088069915771484, - "objective/non_score_reward": -1.8544034957885742, - "objective/rlhf_reward": -7.4176143407821655, - "objective/scores": 0.0, - "policy/approxkl_avg": 9.948873519897461, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.44921875, - "step": 595, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978184700012207 - }, - { - "episode": 9552, - "epoch": 0.057231189560340796, - "loss/policy_avg": 0.5549752712249756, - "lr": 9.619120654396729e-06, - "objective/entropy": -2.4383678436279297, - "objective/kl": 42.59381103515625, - "objective/non_score_reward": -2.129690408706665, - "objective/rlhf_reward": -10.51876163482666, - "objective/scores": -0.5, - "policy/approxkl_avg": 311.0433349609375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6015625, - "step": 596, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999363899230957 - }, - { - "episode": 9568, - "epoch": 0.057327054199470345, - "loss/policy_avg": 0.4941880702972412, - "lr": 9.618481595092026e-06, - "objective/entropy": 94.06913757324219, - "objective/kl": 44.376983642578125, - "objective/non_score_reward": -2.2188491821289062, - "objective/rlhf_reward": -10.875396728515625, - "objective/scores": -0.5, - "policy/approxkl_avg": 36.64936065673828, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3056640625, - "step": 597, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996791958808899 - }, - { - "episode": 9584, - "epoch": 0.057422918838599894, - "loss/policy_avg": 0.269217848777771, - "lr": 9.617842535787323e-06, - "objective/entropy": 140.2623291015625, - "objective/kl": 48.70625305175781, - "objective/non_score_reward": -2.4353127479553223, - "objective/rlhf_reward": -9.74125051498413, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.9949951171875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.435546875, - "step": 598, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9980602264404297 - }, - { - "episode": 9600, - "epoch": 0.05751878347772944, - "loss/policy_avg": 8.679291725158691, - "lr": 9.617203476482618e-06, - "objective/entropy": 132.02699279785156, - "objective/kl": 40.58224868774414, - "objective/non_score_reward": -2.0291123390197754, - "objective/rlhf_reward": -10.116449356079102, - "objective/scores": -0.5, - "policy/approxkl_avg": 24.197277069091797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.71875, - "step": 599, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.996961236000061 - } - ], - "logging_steps": 500, - "max_steps": 7824, - "num_input_tokens_seen": 0, - "num_train_epochs": 3.0, - "save_steps": 200, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 0, - "train_batch_size": null, - "trial_name": null, - "trial_params": null -}