diff --git "a/checkpoint-1600/trainer_state.json" "b/checkpoint-1600/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1600/trainer_state.json" @@ -0,0 +1,28834 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "episode": 25600, + "epoch": 0.15338342260727852, + "eval_steps": 500, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "episode": 16, + "epoch": 9.586463912954908e-05, + "loss/policy_avg": 0.0339290015399456, + "lr": 1e-05, + "objective/entropy": 78.48619842529297, + "objective/kl": 5.6675214767456055, + "objective/non_score_reward": -0.2833760380744934, + "objective/rlhf_reward": 3.2664958328008655, + "objective/scores": 1.1, + "policy/approxkl_avg": 56.270538330078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5703125, + "step": 0, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000828266143799 + }, + { + "episode": 32, + "epoch": 0.00019172927825909816, + "loss/policy_avg": 0.032509539276361465, + "lr": 9.999360940695298e-06, + "objective/entropy": 39.34157943725586, + "objective/kl": 8.134885787963867, + "objective/non_score_reward": -0.40674424171447754, + "objective/rlhf_reward": -1.6269769463688135, + "objective/scores": 0.0, + "policy/approxkl_avg": 125.53129577636719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46875, + "step": 1, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994192123413086 + }, + { + "episode": 48, + "epoch": 0.00028759391738864725, + "loss/policy_avg": 0.2574540972709656, + "lr": 9.998721881390595e-06, + "objective/entropy": 35.90438461303711, + "objective/kl": 10.056818008422852, + "objective/non_score_reward": -0.5028409957885742, + "objective/rlhf_reward": -4.011363983154297, + "objective/scores": -0.5, + "policy/approxkl_avg": 197.81790161132812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73828125, + "step": 2, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99604332447052 + }, + { + "episode": 64, + "epoch": 0.0003834585565181963, + "loss/policy_avg": 0.1315518617630005, + "lr": 9.99808282208589e-06, + "objective/entropy": 163.52642822265625, + "objective/kl": 12.497467041015625, + "objective/non_score_reward": -0.6248733997344971, + "objective/rlhf_reward": -2.499493680894375, + "objective/scores": 0.0, + "policy/approxkl_avg": 280.7725830078125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.716796875, + "step": 3, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999171257019043 + }, + { + "episode": 80, + "epoch": 0.0004793231956477454, + "loss/policy_avg": 0.024046147242188454, + "lr": 9.997443762781187e-06, + "objective/entropy": 118.5094223022461, + "objective/kl": 5.982309818267822, + "objective/non_score_reward": -0.29911553859710693, + "objective/rlhf_reward": -1.196462158113718, + "objective/scores": 0.0, + "policy/approxkl_avg": 52.543487548828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.552734375, + "step": 4, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001847743988037 + }, + { + "episode": 96, + "epoch": 0.0005751878347772945, + "loss/policy_avg": 0.10632362961769104, + "lr": 9.996804703476484e-06, + "objective/entropy": 152.1885986328125, + "objective/kl": 7.815367698669434, + "objective/non_score_reward": -0.3907684087753296, + "objective/rlhf_reward": 0.0987858943933384, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 88.51527404785156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5859375, + "step": 5, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992973804473877 + }, + { + "episode": 112, + "epoch": 0.0006710524739068436, + "loss/policy_avg": 0.08422186970710754, + "lr": 9.99616564417178e-06, + "objective/entropy": 75.0154037475586, + "objective/kl": 17.52770233154297, + "objective/non_score_reward": -0.8763852119445801, + "objective/rlhf_reward": -3.5055407360196114, + "objective/scores": 0.0, + "policy/approxkl_avg": 443.602294921875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 6, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972598552703857 + }, + { + "episode": 128, + "epoch": 0.0007669171130363926, + "loss/policy_avg": 0.007405903190374374, + "lr": 9.995526584867077e-06, + "objective/entropy": 51.67172622680664, + "objective/kl": 11.563663482666016, + "objective/non_score_reward": -0.5781831741333008, + "objective/rlhf_reward": -4.312732696533203, + "objective/scores": -0.5, + "policy/approxkl_avg": 126.90079498291016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.546875, + "step": 7, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0016322135925293 + }, + { + "episode": 144, + "epoch": 0.0008627817521659417, + "loss/policy_avg": 0.13771404325962067, + "lr": 9.994887525562374e-06, + "objective/entropy": 240.35464477539062, + "objective/kl": 18.096904754638672, + "objective/non_score_reward": -0.9048453569412231, + "objective/rlhf_reward": -1.6719702733325317, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 460.8926696777344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.90625, + "step": 8, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999654293060303 + }, + { + "episode": 160, + "epoch": 0.0009586463912954908, + "loss/policy_avg": 0.41069674491882324, + "lr": 9.99424846625767e-06, + "objective/entropy": 224.78262329101562, + "objective/kl": 11.231921195983887, + "objective/non_score_reward": -0.5615960955619812, + "objective/rlhf_reward": -0.8225522383051791, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 167.4181671142578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7578125, + "step": 9, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9952213764190674 + }, + { + "episode": 176, + "epoch": 0.0010545110304250398, + "loss/policy_avg": 0.2340843677520752, + "lr": 9.993609406952966e-06, + "objective/entropy": 77.48204040527344, + "objective/kl": 13.726895332336426, + "objective/non_score_reward": -0.6863448619842529, + "objective/rlhf_reward": -0.34537934362888345, + "objective/scores": 0.6, + "policy/approxkl_avg": 270.8516845703125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.876953125, + "step": 10, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9955778121948242 + }, + { + "episode": 192, + "epoch": 0.001150375669554589, + "loss/policy_avg": 0.1845349222421646, + "lr": 9.992970347648263e-06, + "objective/entropy": -45.138362884521484, + "objective/kl": 14.76271915435791, + "objective/non_score_reward": -0.7381359338760376, + "objective/rlhf_reward": -1.2192103425661722, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 207.85874938964844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.703125, + "step": 11, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000040292739868 + }, + { + "episode": 208, + "epoch": 0.001246240308684138, + "loss/policy_avg": 0.6059431433677673, + "lr": 9.992331288343558e-06, + "objective/entropy": 40.190372467041016, + "objective/kl": 19.720378875732422, + "objective/non_score_reward": -0.9860190749168396, + "objective/rlhf_reward": -1.821370030120883, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 268.6492919921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.654296875, + "step": 12, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9966726303100586 + }, + { + "episode": 224, + "epoch": 0.0013421049478136871, + "loss/policy_avg": -0.0064672790467739105, + "lr": 9.991692229038855e-06, + "objective/entropy": 108.48332214355469, + "objective/kl": 5.689068794250488, + "objective/non_score_reward": -0.28445348143577576, + "objective/rlhf_reward": -1.1378139406442642, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.834894180297852, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4716796875, + "step": 13, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0026183128356934 + }, + { + "episode": 240, + "epoch": 0.001437969586943236, + "loss/policy_avg": 0.6670212745666504, + "lr": 9.991053169734152e-06, + "objective/entropy": 0.18174362182617188, + "objective/kl": 12.982845306396484, + "objective/non_score_reward": -0.6491422653198242, + "objective/rlhf_reward": -2.596569076180458, + "objective/scores": 0.0, + "policy/approxkl_avg": 330.118896484375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.759765625, + "step": 14, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997687339782715 + }, + { + "episode": 256, + "epoch": 0.0015338342260727853, + "loss/policy_avg": 0.21263472735881805, + "lr": 9.990414110429449e-06, + "objective/entropy": 249.88232421875, + "objective/kl": 9.040252685546875, + "objective/non_score_reward": -0.45201271772384644, + "objective/rlhf_reward": -1.808050960302353, + "objective/scores": 0.0, + "policy/approxkl_avg": 102.57914733886719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.744140625, + "step": 15, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000203847885132 + }, + { + "episode": 272, + "epoch": 0.0016296988652023342, + "loss/policy_avg": 0.01660698838531971, + "lr": 9.989775051124744e-06, + "objective/entropy": 106.64703369140625, + "objective/kl": 11.038640022277832, + "objective/non_score_reward": -0.5519319772720337, + "objective/rlhf_reward": -2.2077280431985855, + "objective/scores": 0.0, + "policy/approxkl_avg": 164.40890502929688, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.654296875, + "step": 16, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000194549560547 + }, + { + "episode": 288, + "epoch": 0.0017255635043318834, + "loss/policy_avg": 0.17964985966682434, + "lr": 9.989135991820041e-06, + "objective/entropy": 29.59412956237793, + "objective/kl": 11.429637908935547, + "objective/non_score_reward": -0.5714819431304932, + "objective/rlhf_reward": -2.2859277576208115, + "objective/scores": 0.0, + "policy/approxkl_avg": 113.22151184082031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.642578125, + "step": 17, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986356496810913 + }, + { + "episode": 304, + "epoch": 0.0018214281434614326, + "loss/policy_avg": 0.1845007836818695, + "lr": 9.988496932515338e-06, + "objective/entropy": -2.3180160522460938, + "objective/kl": 15.66268539428711, + "objective/non_score_reward": -0.7831343412399292, + "objective/rlhf_reward": -1.773287498687191, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 162.00823974609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8046875, + "step": 18, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002946853637695 + }, + { + "episode": 320, + "epoch": 0.0019172927825909815, + "loss/policy_avg": 0.14623276889324188, + "lr": 9.987857873210635e-06, + "objective/entropy": 26.79373550415039, + "objective/kl": 16.199674606323242, + "objective/non_score_reward": -0.8099837303161621, + "objective/rlhf_reward": -3.2399348318576813, + "objective/scores": 0.0, + "policy/approxkl_avg": 345.71685791015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7109375, + "step": 19, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992010593414307 + }, + { + "episode": 336, + "epoch": 0.0020131574217205307, + "loss/policy_avg": 0.061316944658756256, + "lr": 9.987218813905932e-06, + "objective/entropy": 30.27604866027832, + "objective/kl": 13.349930763244629, + "objective/non_score_reward": -0.6674965620040894, + "objective/rlhf_reward": -4.669986248016357, + "objective/scores": -0.5, + "policy/approxkl_avg": 182.816650390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 20, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996535301208496 + }, + { + "episode": 352, + "epoch": 0.0021090220608500796, + "loss/policy_avg": -0.08272892981767654, + "lr": 9.986579754601228e-06, + "objective/entropy": 198.63003540039062, + "objective/kl": 11.5382719039917, + "objective/non_score_reward": -0.5769136548042297, + "objective/rlhf_reward": -0.9076545149087907, + "objective/scores": 0.35, + "policy/approxkl_avg": 159.97686767578125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.974609375, + "step": 21, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998477578163147 + }, + { + "episode": 368, + "epoch": 0.0022048866999796286, + "loss/policy_avg": 0.23813551664352417, + "lr": 9.985940695296524e-06, + "objective/entropy": 181.51829528808594, + "objective/kl": 13.08276653289795, + "objective/non_score_reward": -0.6541383266448975, + "objective/rlhf_reward": -4.61655330657959, + "objective/scores": -0.5, + "policy/approxkl_avg": 108.47281646728516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6484375, + "step": 22, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991295337677002 + }, + { + "episode": 384, + "epoch": 0.002300751339109178, + "loss/policy_avg": 0.36420387029647827, + "lr": 9.98530163599182e-06, + "objective/entropy": 257.93609619140625, + "objective/kl": 14.696407318115234, + "objective/non_score_reward": -0.7348203063011169, + "objective/rlhf_reward": -1.3351611531415755, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 150.4597625732422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.849609375, + "step": 23, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980016946792603 + }, + { + "episode": 400, + "epoch": 0.002396615978238727, + "loss/policy_avg": 0.20780539512634277, + "lr": 9.984662576687117e-06, + "objective/entropy": -139.27951049804688, + "objective/kl": 15.462644577026367, + "objective/non_score_reward": -0.77313232421875, + "objective/rlhf_reward": -5.092529296875, + "objective/scores": -0.5, + "policy/approxkl_avg": 237.78317260742188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 24, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999894618988037 + }, + { + "episode": 416, + "epoch": 0.002492480617368276, + "loss/policy_avg": 0.2547074556350708, + "lr": 9.984023517382414e-06, + "objective/entropy": 103.24639892578125, + "objective/kl": 17.307334899902344, + "objective/non_score_reward": -0.8653668165206909, + "objective/rlhf_reward": -5.461467266082764, + "objective/scores": -0.5, + "policy/approxkl_avg": 167.2418212890625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65625, + "step": 25, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000932216644287 + }, + { + "episode": 432, + "epoch": 0.002588345256497825, + "loss/policy_avg": 0.3379603624343872, + "lr": 9.983384458077711e-06, + "objective/entropy": 120.86388397216797, + "objective/kl": 14.275808334350586, + "objective/non_score_reward": -0.7137903571128845, + "objective/rlhf_reward": -2.8551614582538605, + "objective/scores": 0.0, + "policy/approxkl_avg": 99.34181213378906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73828125, + "step": 26, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994611740112305 + }, + { + "episode": 448, + "epoch": 0.0026842098956273742, + "loss/policy_avg": 0.1328231394290924, + "lr": 9.982745398773006e-06, + "objective/entropy": 154.86619567871094, + "objective/kl": 14.35202693939209, + "objective/non_score_reward": -0.7176014184951782, + "objective/rlhf_reward": -4.870405673980713, + "objective/scores": -0.5, + "policy/approxkl_avg": 91.33482360839844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.724609375, + "step": 27, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9957685470581055 + }, + { + "episode": 464, + "epoch": 0.002780074534756923, + "loss/policy_avg": 0.2314174473285675, + "lr": 9.982106339468303e-06, + "objective/entropy": 60.19127655029297, + "objective/kl": 12.188166618347168, + "objective/non_score_reward": -0.6094082593917847, + "objective/rlhf_reward": -2.4376331865787506, + "objective/scores": 0.0, + "policy/approxkl_avg": 205.1094970703125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.810546875, + "step": 28, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.994727611541748 + }, + { + "episode": 480, + "epoch": 0.002875939173886472, + "loss/policy_avg": 0.1261996328830719, + "lr": 9.9814672801636e-06, + "objective/entropy": 67.08200073242188, + "objective/kl": 16.607372283935547, + "objective/non_score_reward": -0.8303685784339905, + "objective/rlhf_reward": -3.3214742839336395, + "objective/scores": 0.0, + "policy/approxkl_avg": 226.6929168701172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.53125, + "step": 29, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976527690887451 + }, + { + "episode": 496, + "epoch": 0.0029718038130160216, + "loss/policy_avg": 0.35239556431770325, + "lr": 9.980828220858897e-06, + "objective/entropy": 204.22116088867188, + "objective/kl": 14.144770622253418, + "objective/non_score_reward": -0.7072385549545288, + "objective/rlhf_reward": 1.5710457801818851, + "objective/scores": 1.1, + "policy/approxkl_avg": 124.6588363647461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7578125, + "step": 30, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977011680603027 + }, + { + "episode": 512, + "epoch": 0.0030676684521455705, + "loss/policy_avg": 0.26766547560691833, + "lr": 9.980189161554194e-06, + "objective/entropy": 107.69725036621094, + "objective/kl": 12.877479553222656, + "objective/non_score_reward": -0.643873929977417, + "objective/rlhf_reward": -2.5754958018660545, + "objective/scores": 0.0, + "policy/approxkl_avg": 186.40504455566406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.701171875, + "step": 31, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972684383392334 + }, + { + "episode": 528, + "epoch": 0.0031635330912751195, + "loss/policy_avg": 0.017455143854022026, + "lr": 9.97955010224949e-06, + "objective/entropy": 69.81261444091797, + "objective/kl": 15.58060073852539, + "objective/non_score_reward": -0.7790300250053406, + "objective/rlhf_reward": -0.19240116024133824, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 326.45733642578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 32, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994511604309082 + }, + { + "episode": 544, + "epoch": 0.0032593977304046684, + "loss/policy_avg": 0.1626880019903183, + "lr": 9.978911042944786e-06, + "objective/entropy": 49.244285583496094, + "objective/kl": 11.25068473815918, + "objective/non_score_reward": -0.5625342130661011, + "objective/rlhf_reward": -0.871534817901951, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 50.66204071044922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7109375, + "step": 33, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9966607093811035 + }, + { + "episode": 560, + "epoch": 0.003355262369534218, + "loss/policy_avg": 0.18031546473503113, + "lr": 9.978271983640083e-06, + "objective/entropy": 147.95474243164062, + "objective/kl": 15.950370788574219, + "objective/non_score_reward": -0.7975186109542847, + "objective/rlhf_reward": -3.190074533224106, + "objective/scores": 0.0, + "policy/approxkl_avg": 201.51080322265625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.515625, + "step": 34, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006275177001953 + }, + { + "episode": 576, + "epoch": 0.0034511270086637668, + "loss/policy_avg": -0.05689749866724014, + "lr": 9.977632924335378e-06, + "objective/entropy": 4.240108489990234, + "objective/kl": 13.38272762298584, + "objective/non_score_reward": -0.6691364049911499, + "objective/rlhf_reward": -4.6765456199646, + "objective/scores": -0.5, + "policy/approxkl_avg": 247.30409240722656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.796875, + "step": 35, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973469972610474 + }, + { + "episode": 592, + "epoch": 0.0035469916477933157, + "loss/policy_avg": 0.16461774706840515, + "lr": 9.976993865030675e-06, + "objective/entropy": 123.00151824951172, + "objective/kl": 11.21810531616211, + "objective/non_score_reward": -0.5609052181243896, + "objective/rlhf_reward": -4.243620872497559, + "objective/scores": -0.5, + "policy/approxkl_avg": 64.79019927978516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.712890625, + "step": 36, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0026960372924805 + }, + { + "episode": 608, + "epoch": 0.003642856286922865, + "loss/policy_avg": 0.15750062465667725, + "lr": 9.976354805725972e-06, + "objective/entropy": 30.60162925720215, + "objective/kl": 17.013538360595703, + "objective/non_score_reward": -0.850676953792572, + "objective/rlhf_reward": -3.4027078449726105, + "objective/scores": 0.0, + "policy/approxkl_avg": 234.95870971679688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 37, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998824119567871 + }, + { + "episode": 624, + "epoch": 0.003738720926052414, + "loss/policy_avg": 0.03158241882920265, + "lr": 9.975715746421269e-06, + "objective/entropy": 115.71566772460938, + "objective/kl": 15.653677940368652, + "objective/non_score_reward": -0.7826838493347168, + "objective/rlhf_reward": -3.1307354420423508, + "objective/scores": 0.0, + "policy/approxkl_avg": 233.744873046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.765625, + "step": 38, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997577428817749 + }, + { + "episode": 640, + "epoch": 0.003834585565181963, + "loss/policy_avg": -0.031586866825819016, + "lr": 9.975076687116566e-06, + "objective/entropy": 70.69473266601562, + "objective/kl": 13.526529312133789, + "objective/non_score_reward": -0.6763265132904053, + "objective/rlhf_reward": 1.6946939915418628, + "objective/scores": 1.1, + "policy/approxkl_avg": 101.47872924804688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.677734375, + "step": 39, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0043745040893555 + }, + { + "episode": 656, + "epoch": 0.003930450204311512, + "loss/policy_avg": 0.12032957375049591, + "lr": 9.97443762781186e-06, + "objective/entropy": 172.61669921875, + "objective/kl": 16.604652404785156, + "objective/non_score_reward": -0.8302326798439026, + "objective/rlhf_reward": -5.320930480957031, + "objective/scores": -0.5, + "policy/approxkl_avg": 220.98178100585938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.642578125, + "step": 40, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9957454204559326 + }, + { + "episode": 672, + "epoch": 0.004026314843441061, + "loss/policy_avg": 0.0876859575510025, + "lr": 9.973798568507158e-06, + "objective/entropy": 12.816411972045898, + "objective/kl": 13.908916473388672, + "objective/non_score_reward": -0.6954457759857178, + "objective/rlhf_reward": -4.781783103942871, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.01300048828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 41, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0018563270568848 + }, + { + "episode": 688, + "epoch": 0.00412217948257061, + "loss/policy_avg": 0.1008758619427681, + "lr": 9.973159509202454e-06, + "objective/entropy": 257.1292724609375, + "objective/kl": 11.528783798217773, + "objective/non_score_reward": -0.5764391422271729, + "objective/rlhf_reward": -2.3057566583156586, + "objective/scores": 0.0, + "policy/approxkl_avg": 84.59580993652344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.75390625, + "step": 42, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9929132461547852 + }, + { + "episode": 704, + "epoch": 0.004218044121700159, + "loss/policy_avg": 0.44639891386032104, + "lr": 9.972520449897751e-06, + "objective/entropy": 62.445350646972656, + "objective/kl": 13.397602081298828, + "objective/non_score_reward": -0.6698801517486572, + "objective/rlhf_reward": -4.679520606994629, + "objective/scores": -0.5, + "policy/approxkl_avg": 185.67079162597656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.533203125, + "step": 43, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992148876190186 + }, + { + "episode": 720, + "epoch": 0.004313908760829708, + "loss/policy_avg": 0.09568839520215988, + "lr": 9.971881390593048e-06, + "objective/entropy": 129.84619140625, + "objective/kl": 14.350381851196289, + "objective/non_score_reward": -0.7175191640853882, + "objective/rlhf_reward": -0.9226653081940968, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 253.10037231445312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 44, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9957365989685059 + }, + { + "episode": 736, + "epoch": 0.004409773399959257, + "loss/policy_avg": 0.22084593772888184, + "lr": 9.971242331288345e-06, + "objective/entropy": -42.89992904663086, + "objective/kl": 15.893115997314453, + "objective/non_score_reward": -0.7946557998657227, + "objective/rlhf_reward": -1.44528977672259, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 121.7098388671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.751953125, + "step": 45, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000582218170166 + }, + { + "episode": 752, + "epoch": 0.004505638039088807, + "loss/policy_avg": 0.22875869274139404, + "lr": 9.97060327198364e-06, + "objective/entropy": 9.025165557861328, + "objective/kl": 22.01996612548828, + "objective/non_score_reward": -1.1009982824325562, + "objective/rlhf_reward": -1.48027405583975, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 321.845703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5859375, + "step": 46, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9953069686889648 + }, + { + "episode": 768, + "epoch": 0.004601502678218356, + "loss/policy_avg": 0.07661572843790054, + "lr": 9.969964212678937e-06, + "objective/entropy": 10.382087707519531, + "objective/kl": 13.358439445495605, + "objective/non_score_reward": -0.6679220199584961, + "objective/rlhf_reward": 1.7283119499683384, + "objective/scores": 1.1, + "policy/approxkl_avg": 29.50304412841797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.638671875, + "step": 47, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999579668045044 + }, + { + "episode": 784, + "epoch": 0.004697367317347905, + "loss/policy_avg": 0.19636262953281403, + "lr": 9.969325153374234e-06, + "objective/entropy": 0.6832618713378906, + "objective/kl": 8.836541175842285, + "objective/non_score_reward": -0.4418269991874695, + "objective/rlhf_reward": -1.767308071255684, + "objective/scores": 0.0, + "policy/approxkl_avg": 94.3209228515625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.712890625, + "step": 48, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998420000076294 + }, + { + "episode": 800, + "epoch": 0.004793231956477454, + "loss/policy_avg": -0.02652953751385212, + "lr": 9.968686094069531e-06, + "objective/entropy": 125.6042709350586, + "objective/kl": 15.016199111938477, + "objective/non_score_reward": -0.7508100271224976, + "objective/rlhf_reward": -5.00324010848999, + "objective/scores": -0.5, + "policy/approxkl_avg": 207.12213134765625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7421875, + "step": 49, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002742052078247 + }, + { + "episode": 816, + "epoch": 0.004889096595607003, + "loss/policy_avg": 0.10039197653532028, + "lr": 9.968047034764828e-06, + "objective/entropy": -24.506595611572266, + "objective/kl": 18.726213455200195, + "objective/non_score_reward": -0.9363107085227966, + "objective/rlhf_reward": -5.745243072509766, + "objective/scores": -0.5, + "policy/approxkl_avg": 300.677490234375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6953125, + "step": 50, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997317790985107 + }, + { + "episode": 832, + "epoch": 0.004984961234736552, + "loss/policy_avg": 0.18666991591453552, + "lr": 9.967407975460123e-06, + "objective/entropy": -47.62429428100586, + "objective/kl": 13.258740425109863, + "objective/non_score_reward": -0.6629370450973511, + "objective/rlhf_reward": -1.0954889049201753, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 110.42059326171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 51, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9965816736221313 + }, + { + "episode": 848, + "epoch": 0.005080825873866101, + "loss/policy_avg": 0.195734903216362, + "lr": 9.96676891615542e-06, + "objective/entropy": 57.396114349365234, + "objective/kl": 15.980720520019531, + "objective/non_score_reward": -0.7990360856056213, + "objective/rlhf_reward": -5.196144104003906, + "objective/scores": -0.5, + "policy/approxkl_avg": 125.00595092773438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.775390625, + "step": 52, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.995786428451538 + }, + { + "episode": 864, + "epoch": 0.00517669051299565, + "loss/policy_avg": 0.24296848475933075, + "lr": 9.966129856850717e-06, + "objective/entropy": 99.57502746582031, + "objective/kl": 17.62392807006836, + "objective/non_score_reward": -0.8811964988708496, + "objective/rlhf_reward": -2.0090143916928134, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 99.32807922363281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.662109375, + "step": 53, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0016443729400635 + }, + { + "episode": 880, + "epoch": 0.0052725551521251995, + "loss/policy_avg": 0.5130509734153748, + "lr": 9.965490797546014e-06, + "objective/entropy": 34.0892219543457, + "objective/kl": 14.999124526977539, + "objective/non_score_reward": -0.7499562501907349, + "objective/rlhf_reward": -1.4840530840479695, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 130.593017578125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.75390625, + "step": 54, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000711679458618 + }, + { + "episode": 896, + "epoch": 0.0053684197912547485, + "loss/policy_avg": 0.3232521116733551, + "lr": 9.96485173824131e-06, + "objective/entropy": 69.26298522949219, + "objective/kl": 21.724315643310547, + "objective/non_score_reward": -1.086215853691101, + "objective/rlhf_reward": -2.788603871074274, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 234.98104858398438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 55, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9952917098999023 + }, + { + "episode": 912, + "epoch": 0.0054642844303842975, + "loss/policy_avg": 0.10791392624378204, + "lr": 9.964212678936606e-06, + "objective/entropy": 32.22584533691406, + "objective/kl": 15.846414566040039, + "objective/non_score_reward": -0.7923207879066467, + "objective/rlhf_reward": 1.230716893076897, + "objective/scores": 1.1, + "policy/approxkl_avg": 256.9724426269531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4736328125, + "step": 56, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984806776046753 + }, + { + "episode": 928, + "epoch": 0.005560149069513846, + "loss/policy_avg": 0.37206730246543884, + "lr": 9.963573619631903e-06, + "objective/entropy": -6.195688247680664, + "objective/kl": 12.801559448242188, + "objective/non_score_reward": -0.6400780081748962, + "objective/rlhf_reward": -2.5603120028972626, + "objective/scores": 0.0, + "policy/approxkl_avg": 40.83631896972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.744140625, + "step": 57, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0007452964782715 + }, + { + "episode": 944, + "epoch": 0.005656013708643395, + "loss/policy_avg": 0.05091024935245514, + "lr": 9.9629345603272e-06, + "objective/entropy": -18.476280212402344, + "objective/kl": 18.95052719116211, + "objective/non_score_reward": -0.9475262761116028, + "objective/rlhf_reward": -3.790105164051056, + "objective/scores": 0.0, + "policy/approxkl_avg": 144.4001007080078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.580078125, + "step": 58, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988869428634644 + }, + { + "episode": 960, + "epoch": 0.005751878347772944, + "loss/policy_avg": -0.008312445133924484, + "lr": 9.962295501022495e-06, + "objective/entropy": 217.08169555664062, + "objective/kl": 14.908738136291504, + "objective/non_score_reward": -0.7454369068145752, + "objective/rlhf_reward": -2.9817477762699127, + "objective/scores": 0.0, + "policy/approxkl_avg": 93.395751953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.77734375, + "step": 59, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973504543304443 + }, + { + "episode": 976, + "epoch": 0.005847742986902493, + "loss/policy_avg": 0.03407389298081398, + "lr": 9.961656441717792e-06, + "objective/entropy": 140.58189392089844, + "objective/kl": 20.377920150756836, + "objective/non_score_reward": -1.0188961029052734, + "objective/rlhf_reward": -6.075584411621094, + "objective/scores": -0.5, + "policy/approxkl_avg": 66.34793090820312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.673828125, + "step": 60, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996695518493652 + }, + { + "episode": 992, + "epoch": 0.005943607626032043, + "loss/policy_avg": 0.08645053207874298, + "lr": 9.961017382413088e-06, + "objective/entropy": 19.108230590820312, + "objective/kl": 13.644828796386719, + "objective/non_score_reward": -0.6822414994239807, + "objective/rlhf_reward": -2.7289658784866333, + "objective/scores": 0.0, + "policy/approxkl_avg": 35.71690368652344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.564453125, + "step": 61, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962527751922607 + }, + { + "episode": 1008, + "epoch": 0.006039472265161592, + "loss/policy_avg": -0.17965860664844513, + "lr": 9.960378323108385e-06, + "objective/entropy": 168.96075439453125, + "objective/kl": 11.691057205200195, + "objective/non_score_reward": -0.5845528841018677, + "objective/rlhf_reward": -4.338212013244629, + "objective/scores": -0.5, + "policy/approxkl_avg": 65.8020248413086, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.775390625, + "step": 62, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0171313285827637 + }, + { + "episode": 1024, + "epoch": 0.006135336904291141, + "loss/policy_avg": 0.061459362506866455, + "lr": 9.959739263803682e-06, + "objective/entropy": 117.6607437133789, + "objective/kl": 15.35727310180664, + "objective/non_score_reward": -0.7678636312484741, + "objective/rlhf_reward": -3.071454644203186, + "objective/scores": 0.0, + "policy/approxkl_avg": 187.7376708984375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9375, + "step": 63, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990172386169434 + }, + { + "episode": 1040, + "epoch": 0.00623120154342069, + "loss/policy_avg": 0.07200516015291214, + "lr": 9.959100204498979e-06, + "objective/entropy": 20.476089477539062, + "objective/kl": 13.475000381469727, + "objective/non_score_reward": -0.6737500429153442, + "objective/rlhf_reward": 1.7049996197223667, + "objective/scores": 1.1, + "policy/approxkl_avg": 68.92333984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6796875, + "step": 64, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985809326171875 + }, + { + "episode": 1056, + "epoch": 0.006327066182550239, + "loss/policy_avg": 0.08365275710821152, + "lr": 9.958461145194274e-06, + "objective/entropy": -127.62371826171875, + "objective/kl": 22.050678253173828, + "objective/non_score_reward": -1.1025339365005493, + "objective/rlhf_reward": -4.410135626792908, + "objective/scores": 0.0, + "policy/approxkl_avg": 301.7841491699219, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.720703125, + "step": 65, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990687370300293 + }, + { + "episode": 1072, + "epoch": 0.006422930821679788, + "loss/policy_avg": 0.040758199989795685, + "lr": 9.957822085889571e-06, + "objective/entropy": 70.77458190917969, + "objective/kl": 18.2130069732666, + "objective/non_score_reward": -0.9106502532958984, + "objective/rlhf_reward": -1.9807416550522907, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 190.14797973632812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.439453125, + "step": 66, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984623193740845 + }, + { + "episode": 1088, + "epoch": 0.006518795460809337, + "loss/policy_avg": 0.05295582860708237, + "lr": 9.957183026584868e-06, + "objective/entropy": 97.35667419433594, + "objective/kl": 24.64842987060547, + "objective/non_score_reward": -1.2324215173721313, + "objective/rlhf_reward": -4.929685860872269, + "objective/scores": 0.0, + "policy/approxkl_avg": 394.2406921386719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.67578125, + "step": 67, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007405281066895 + }, + { + "episode": 1104, + "epoch": 0.006614660099938887, + "loss/policy_avg": 0.14266067743301392, + "lr": 9.956543967280165e-06, + "objective/entropy": 85.57185363769531, + "objective/kl": 14.76464557647705, + "objective/non_score_reward": -0.7382322549819946, + "objective/rlhf_reward": -0.029210095049115647, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 171.19406127929688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 68, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9964725971221924 + }, + { + "episode": 1120, + "epoch": 0.006710524739068436, + "loss/policy_avg": 0.11469551920890808, + "lr": 9.955904907975462e-06, + "objective/entropy": 21.974023818969727, + "objective/kl": 16.630640029907227, + "objective/non_score_reward": -0.8315319418907166, + "objective/rlhf_reward": -3.326127827167511, + "objective/scores": 0.0, + "policy/approxkl_avg": 63.985679626464844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.72265625, + "step": 69, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999800682067871 + }, + { + "episode": 1136, + "epoch": 0.006806389378197985, + "loss/policy_avg": 0.10287429392337799, + "lr": 9.955265848670757e-06, + "objective/entropy": 43.38239288330078, + "objective/kl": 22.02418327331543, + "objective/non_score_reward": -1.101209282875061, + "objective/rlhf_reward": -4.404837071895599, + "objective/scores": 0.0, + "policy/approxkl_avg": 135.89984130859375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69140625, + "step": 70, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973747730255127 + }, + { + "episode": 1152, + "epoch": 0.0069022540173275335, + "loss/policy_avg": 2.0731570720672607, + "lr": 9.954626789366054e-06, + "objective/entropy": -7.300925254821777, + "objective/kl": 17.635089874267578, + "objective/non_score_reward": -0.8817545175552368, + "objective/rlhf_reward": -3.5270181000232697, + "objective/scores": 0.0, + "policy/approxkl_avg": 183.6417236328125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65625, + "step": 71, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976426362991333 + }, + { + "episode": 1168, + "epoch": 0.0069981186564570825, + "loss/policy_avg": 0.24466943740844727, + "lr": 9.95398773006135e-06, + "objective/entropy": -26.054595947265625, + "objective/kl": 15.905699729919434, + "objective/non_score_reward": -0.7952849864959717, + "objective/rlhf_reward": -5.181139945983887, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.05584716796875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 72, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9955050945281982 + }, + { + "episode": 1184, + "epoch": 0.0070939832955866314, + "loss/policy_avg": 0.4031391739845276, + "lr": 9.953348670756648e-06, + "objective/entropy": -107.55976867675781, + "objective/kl": 19.68102264404297, + "objective/non_score_reward": -0.9840512275695801, + "objective/rlhf_reward": -3.936204746365547, + "objective/scores": 0.0, + "policy/approxkl_avg": 232.6634521484375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.751953125, + "step": 73, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00066876411438 + }, + { + "episode": 1200, + "epoch": 0.00718984793471618, + "loss/policy_avg": 0.1890699565410614, + "lr": 9.952709611451944e-06, + "objective/entropy": 118.900146484375, + "objective/kl": 21.680133819580078, + "objective/non_score_reward": -1.084006667137146, + "objective/rlhf_reward": -6.336027145385742, + "objective/scores": -0.5, + "policy/approxkl_avg": 266.20465087890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 74, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997557282447815 + }, + { + "episode": 1216, + "epoch": 0.00728571257384573, + "loss/policy_avg": 0.383888304233551, + "lr": 9.952070552147241e-06, + "objective/entropy": 124.33120727539062, + "objective/kl": 21.27002716064453, + "objective/non_score_reward": -1.0635013580322266, + "objective/rlhf_reward": -4.254005193710327, + "objective/scores": 0.0, + "policy/approxkl_avg": 86.44483184814453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.611328125, + "step": 75, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962941408157349 + }, + { + "episode": 1232, + "epoch": 0.007381577212975279, + "loss/policy_avg": 0.23960661888122559, + "lr": 9.951431492842536e-06, + "objective/entropy": 40.334468841552734, + "objective/kl": 17.827497482299805, + "objective/non_score_reward": -0.891374945640564, + "objective/rlhf_reward": -3.5654996633529663, + "objective/scores": 0.0, + "policy/approxkl_avg": 94.15713500976562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.548828125, + "step": 76, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984164237976074 + }, + { + "episode": 1248, + "epoch": 0.007477441852104828, + "loss/policy_avg": 0.4706483781337738, + "lr": 9.950792433537833e-06, + "objective/entropy": 106.07322692871094, + "objective/kl": 19.12630844116211, + "objective/non_score_reward": -0.9563154578208923, + "objective/rlhf_reward": -3.8252618312835693, + "objective/scores": 0.0, + "policy/approxkl_avg": 185.7378387451172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.87890625, + "step": 77, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9966304302215576 + }, + { + "episode": 1264, + "epoch": 0.007573306491234377, + "loss/policy_avg": 0.0665474385023117, + "lr": 9.950153374233129e-06, + "objective/entropy": 34.984527587890625, + "objective/kl": 23.865880966186523, + "objective/non_score_reward": -1.1932940483093262, + "objective/rlhf_reward": -3.373176074028015, + "objective/scores": 0.35, + "policy/approxkl_avg": 336.36712646484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 78, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997634768486023 + }, + { + "episode": 1280, + "epoch": 0.007669171130363926, + "loss/policy_avg": 0.3842596113681793, + "lr": 9.949514314928425e-06, + "objective/entropy": 229.88047790527344, + "objective/kl": 27.72378921508789, + "objective/non_score_reward": -1.386189579963684, + "objective/rlhf_reward": -1.144758558273315, + "objective/scores": 1.1, + "policy/approxkl_avg": 524.0328369140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 79, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997296690940857 + }, + { + "episode": 1296, + "epoch": 0.007765035769493475, + "loss/policy_avg": 0.4347228705883026, + "lr": 9.948875255623722e-06, + "objective/entropy": -43.351566314697266, + "objective/kl": 18.37939453125, + "objective/non_score_reward": -0.9189697504043579, + "objective/rlhf_reward": -3.6758789718151093, + "objective/scores": 0.0, + "policy/approxkl_avg": 87.53759002685547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.521484375, + "step": 80, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968631267547607 + }, + { + "episode": 1312, + "epoch": 0.007860900408623025, + "loss/policy_avg": 0.5703809261322021, + "lr": 9.94823619631902e-06, + "objective/entropy": 182.94879150390625, + "objective/kl": 24.6871337890625, + "objective/non_score_reward": -1.2343567609786987, + "objective/rlhf_reward": -6.937427043914795, + "objective/scores": -0.5, + "policy/approxkl_avg": 274.44744873046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71875, + "step": 81, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995486736297607 + }, + { + "episode": 1328, + "epoch": 0.007956765047752574, + "loss/policy_avg": 0.10641711950302124, + "lr": 9.947597137014316e-06, + "objective/entropy": 73.29893493652344, + "objective/kl": 17.603548049926758, + "objective/non_score_reward": -0.88017737865448, + "objective/rlhf_reward": -3.52070951461792, + "objective/scores": 0.0, + "policy/approxkl_avg": 123.0771255493164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 82, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998863935470581 + }, + { + "episode": 1344, + "epoch": 0.008052629686882123, + "loss/policy_avg": 0.12928956747055054, + "lr": 9.946958077709611e-06, + "objective/entropy": 108.6548080444336, + "objective/kl": 18.59684944152832, + "objective/non_score_reward": -0.929842472076416, + "objective/rlhf_reward": -2.3407678390420497, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 120.68421936035156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 83, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998254656791687 + }, + { + "episode": 1360, + "epoch": 0.008148494326011672, + "loss/policy_avg": 0.14865761995315552, + "lr": 9.946319018404908e-06, + "objective/entropy": 102.67412567138672, + "objective/kl": 23.651020050048828, + "objective/non_score_reward": -1.1825510263442993, + "objective/rlhf_reward": -4.7302040457725525, + "objective/scores": 0.0, + "policy/approxkl_avg": 73.66981506347656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4033203125, + "step": 84, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000584602355957 + }, + { + "episode": 1376, + "epoch": 0.00824435896514122, + "loss/policy_avg": 0.07400541007518768, + "lr": 9.945679959100205e-06, + "objective/entropy": 133.18292236328125, + "objective/kl": 11.912694931030273, + "objective/non_score_reward": -0.5956346988677979, + "objective/rlhf_reward": -0.4351277453469593, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 49.34624099731445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59375, + "step": 85, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000187397003174 + }, + { + "episode": 1392, + "epoch": 0.00834022360427077, + "loss/policy_avg": 0.13357847929000854, + "lr": 9.945040899795502e-06, + "objective/entropy": 112.34770202636719, + "objective/kl": 20.725894927978516, + "objective/non_score_reward": -1.0362948179244995, + "objective/rlhf_reward": -2.7665772224343836, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 157.26473999023438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.56640625, + "step": 86, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9963226318359375 + }, + { + "episode": 1408, + "epoch": 0.008436088243400319, + "loss/policy_avg": 0.32753437757492065, + "lr": 9.944401840490799e-06, + "objective/entropy": 43.2598762512207, + "objective/kl": 19.98666000366211, + "objective/non_score_reward": -0.9993331432342529, + "objective/rlhf_reward": -5.997332572937012, + "objective/scores": -0.5, + "policy/approxkl_avg": 257.4547424316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.599609375, + "step": 87, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9966068267822266 + }, + { + "episode": 1424, + "epoch": 0.008531952882529868, + "loss/policy_avg": 0.09795168787240982, + "lr": 9.943762781186096e-06, + "objective/entropy": -59.364646911621094, + "objective/kl": 14.953709602355957, + "objective/non_score_reward": -0.7476855516433716, + "objective/rlhf_reward": -4.990742206573486, + "objective/scores": -0.5, + "policy/approxkl_avg": 55.110633850097656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 88, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9969828128814697 + }, + { + "episode": 1440, + "epoch": 0.008627817521659416, + "loss/policy_avg": 0.06303240358829498, + "lr": 9.94312372188139e-06, + "objective/entropy": 50.4556770324707, + "objective/kl": 19.505146026611328, + "objective/non_score_reward": -0.9752573370933533, + "objective/rlhf_reward": -3.9010292887687683, + "objective/scores": 0.0, + "policy/approxkl_avg": 34.922752380371094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.607421875, + "step": 89, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9964232444763184 + }, + { + "episode": 1456, + "epoch": 0.008723682160788965, + "loss/policy_avg": 0.2796894907951355, + "lr": 9.942484662576688e-06, + "objective/entropy": 135.44993591308594, + "objective/kl": 22.230022430419922, + "objective/non_score_reward": -1.1115009784698486, + "objective/rlhf_reward": -4.446004092693329, + "objective/scores": 0.0, + "policy/approxkl_avg": 48.1524658203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 90, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998495101928711 + }, + { + "episode": 1472, + "epoch": 0.008819546799918514, + "loss/policy_avg": 0.12483496963977814, + "lr": 9.941845603271985e-06, + "objective/entropy": 148.17709350585938, + "objective/kl": 17.60011100769043, + "objective/non_score_reward": -0.8800055384635925, + "objective/rlhf_reward": -5.520022392272949, + "objective/scores": -0.5, + "policy/approxkl_avg": 142.25204467773438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6171875, + "step": 91, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998748779296875 + }, + { + "episode": 1488, + "epoch": 0.008915411439048063, + "loss/policy_avg": 0.0684453696012497, + "lr": 9.941206543967281e-06, + "objective/entropy": 25.60771942138672, + "objective/kl": 17.530319213867188, + "objective/non_score_reward": -0.8765159845352173, + "objective/rlhf_reward": -5.506063938140869, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.80863189697266, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.625, + "step": 92, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999198913574219 + }, + { + "episode": 1504, + "epoch": 0.009011276078177614, + "loss/policy_avg": 0.13488999009132385, + "lr": 9.940567484662578e-06, + "objective/entropy": -75.2538070678711, + "objective/kl": 19.696504592895508, + "objective/non_score_reward": -0.9848252534866333, + "objective/rlhf_reward": -3.939300984144211, + "objective/scores": 0.0, + "policy/approxkl_avg": 214.8182373046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.515625, + "step": 93, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989702701568604 + }, + { + "episode": 1520, + "epoch": 0.009107140717307163, + "loss/policy_avg": 0.02409663423895836, + "lr": 9.939928425357874e-06, + "objective/entropy": 8.831840515136719, + "objective/kl": 25.456069946289062, + "objective/non_score_reward": -1.272803544998169, + "objective/rlhf_reward": -3.5754421589695777, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 176.86953735351562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4873046875, + "step": 94, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9964553117752075 + }, + { + "episode": 1536, + "epoch": 0.009203005356436712, + "loss/policy_avg": 0.0426328219473362, + "lr": 9.93928936605317e-06, + "objective/entropy": 185.92372131347656, + "objective/kl": 19.176239013671875, + "objective/non_score_reward": -0.95881187915802, + "objective/rlhf_reward": -3.8352474570274353, + "objective/scores": 0.0, + "policy/approxkl_avg": 269.95849609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 95, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9930847883224487 + }, + { + "episode": 1552, + "epoch": 0.009298869995566261, + "loss/policy_avg": 0.3135666251182556, + "lr": 9.938650306748467e-06, + "objective/entropy": -119.88722229003906, + "objective/kl": 18.911632537841797, + "objective/non_score_reward": -0.9455816745758057, + "objective/rlhf_reward": -3.782326579093933, + "objective/scores": 0.0, + "policy/approxkl_avg": 136.56689453125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.630859375, + "step": 96, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0003180503845215 + }, + { + "episode": 1568, + "epoch": 0.00939473463469581, + "loss/policy_avg": 0.1893162876367569, + "lr": 9.938011247443764e-06, + "objective/entropy": 179.721435546875, + "objective/kl": 21.26153564453125, + "objective/non_score_reward": -1.0630767345428467, + "objective/rlhf_reward": -1.3285883411180701, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 203.61773681640625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.70703125, + "step": 97, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.994171142578125 + }, + { + "episode": 1584, + "epoch": 0.009490599273825359, + "loss/policy_avg": 0.5632504224777222, + "lr": 9.937372188139061e-06, + "objective/entropy": 3.3514366149902344, + "objective/kl": 19.21142578125, + "objective/non_score_reward": -0.9605712890625, + "objective/rlhf_reward": -2.483035289977474, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 74.39619445800781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6953125, + "step": 98, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99905526638031 + }, + { + "episode": 1600, + "epoch": 0.009586463912954908, + "loss/policy_avg": 0.20837292075157166, + "lr": 9.936733128834358e-06, + "objective/entropy": 121.03665161132812, + "objective/kl": 13.999438285827637, + "objective/non_score_reward": -0.6999719142913818, + "objective/rlhf_reward": -4.799887657165527, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.508689880371094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.375, + "step": 99, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994678497314453 + }, + { + "episode": 1616, + "epoch": 0.009682328552084457, + "loss/policy_avg": 0.2726283669471741, + "lr": 9.936094069529653e-06, + "objective/entropy": 110.09475708007812, + "objective/kl": 15.960447311401367, + "objective/non_score_reward": -0.798022449016571, + "objective/rlhf_reward": -3.192089796066284, + "objective/scores": 0.0, + "policy/approxkl_avg": 61.767425537109375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55859375, + "step": 100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0027127265930176 + }, + { + "episode": 1632, + "epoch": 0.009778193191214006, + "loss/policy_avg": 0.2845292091369629, + "lr": 9.93545501022495e-06, + "objective/entropy": -153.4110107421875, + "objective/kl": 16.05643081665039, + "objective/non_score_reward": -0.8028215765953064, + "objective/rlhf_reward": 1.1887137234210972, + "objective/scores": 1.1, + "policy/approxkl_avg": 95.32630157470703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.669921875, + "step": 101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997380256652832 + }, + { + "episode": 1648, + "epoch": 0.009874057830343555, + "loss/policy_avg": 0.18602727353572845, + "lr": 9.934815950920245e-06, + "objective/entropy": -13.683324813842773, + "objective/kl": 23.494054794311523, + "objective/non_score_reward": -1.174702763557434, + "objective/rlhf_reward": -6.698811054229736, + "objective/scores": -0.5, + "policy/approxkl_avg": 145.7095947265625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4912109375, + "step": 102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994981288909912 + }, + { + "episode": 1664, + "epoch": 0.009969922469473104, + "loss/policy_avg": 0.2709546983242035, + "lr": 9.934176891615542e-06, + "objective/entropy": -101.46907043457031, + "objective/kl": 22.274028778076172, + "objective/non_score_reward": -1.113701581954956, + "objective/rlhf_reward": -2.507394979672368, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 308.6561584472656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4912109375, + "step": 103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9981049299240112 + }, + { + "episode": 1680, + "epoch": 0.010065787108602653, + "loss/policy_avg": 0.0334465391933918, + "lr": 9.933537832310839e-06, + "objective/entropy": 130.1453857421875, + "objective/kl": 29.715213775634766, + "objective/non_score_reward": -1.4857605695724487, + "objective/rlhf_reward": -4.601406863241821, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 351.42138671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5234375, + "step": 104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998427152633667 + }, + { + "episode": 1696, + "epoch": 0.010161651747732202, + "loss/policy_avg": 0.24828088283538818, + "lr": 9.932898773006136e-06, + "objective/entropy": 125.98516082763672, + "objective/kl": 15.857706069946289, + "objective/non_score_reward": -0.7928853034973145, + "objective/rlhf_reward": -1.720943163247451, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 74.20083618164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578125, + "step": 105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993948936462402 + }, + { + "episode": 1712, + "epoch": 0.01025751638686175, + "loss/policy_avg": 0.2954871356487274, + "lr": 9.932259713701433e-06, + "objective/entropy": 97.68868255615234, + "objective/kl": 12.135580062866211, + "objective/non_score_reward": -0.6067790389060974, + "objective/rlhf_reward": -2.4271161258220673, + "objective/scores": 0.0, + "policy/approxkl_avg": 67.66595458984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992032051086426 + }, + { + "episode": 1728, + "epoch": 0.0103533810259913, + "loss/policy_avg": 0.10418711602687836, + "lr": 9.931620654396728e-06, + "objective/entropy": -43.816890716552734, + "objective/kl": 19.110689163208008, + "objective/non_score_reward": -0.9555345773696899, + "objective/rlhf_reward": -5.82213830947876, + "objective/scores": -0.5, + "policy/approxkl_avg": 160.15283203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975645542144775 + }, + { + "episode": 1744, + "epoch": 0.01044924566512085, + "loss/policy_avg": 0.23229390382766724, + "lr": 9.930981595092025e-06, + "objective/entropy": 91.57461547851562, + "objective/kl": 18.9378662109375, + "objective/non_score_reward": -0.9468932747840881, + "objective/rlhf_reward": -3.787573218345642, + "objective/scores": 0.0, + "policy/approxkl_avg": 155.35989379882812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9963436126708984 + }, + { + "episode": 1760, + "epoch": 0.010545110304250399, + "loss/policy_avg": 0.3382238447666168, + "lr": 9.930342535787322e-06, + "objective/entropy": -49.52970886230469, + "objective/kl": 17.919204711914062, + "objective/non_score_reward": -0.89596027135849, + "objective/rlhf_reward": -5.583841323852539, + "objective/scores": -0.5, + "policy/approxkl_avg": 151.97140502929688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73828125, + "step": 109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9960722923278809 + }, + { + "episode": 1776, + "epoch": 0.010640974943379948, + "loss/policy_avg": 0.16102033853530884, + "lr": 9.929703476482619e-06, + "objective/entropy": -40.16828155517578, + "objective/kl": 15.826179504394531, + "objective/non_score_reward": -0.7913089990615845, + "objective/rlhf_reward": -3.1652360260486603, + "objective/scores": 0.0, + "policy/approxkl_avg": 32.21597671508789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9961647987365723 + }, + { + "episode": 1792, + "epoch": 0.010736839582509497, + "loss/policy_avg": 0.08855805546045303, + "lr": 9.929064417177915e-06, + "objective/entropy": 187.74282836914062, + "objective/kl": 22.12034797668457, + "objective/non_score_reward": -1.1060173511505127, + "objective/rlhf_reward": -6.424069404602051, + "objective/scores": -0.5, + "policy/approxkl_avg": 89.00162506103516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80859375, + "step": 111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971915483474731 + }, + { + "episode": 1808, + "epoch": 0.010832704221639046, + "loss/policy_avg": 0.3315132260322571, + "lr": 9.928425357873212e-06, + "objective/entropy": -130.41551208496094, + "objective/kl": 20.600021362304688, + "objective/non_score_reward": -1.030001163482666, + "objective/rlhf_reward": -2.7414021278298915, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 231.54774475097656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009350776672363 + }, + { + "episode": 1824, + "epoch": 0.010928568860768595, + "loss/policy_avg": 0.5379164814949036, + "lr": 9.927786298568507e-06, + "objective/entropy": 122.76021575927734, + "objective/kl": 24.667219161987305, + "objective/non_score_reward": -1.2333608865737915, + "objective/rlhf_reward": -6.933443546295166, + "objective/scores": -0.5, + "policy/approxkl_avg": 214.82647705078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4638671875, + "step": 113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977924823760986 + }, + { + "episode": 1840, + "epoch": 0.011024433499898144, + "loss/policy_avg": 0.19039300084114075, + "lr": 9.927147239263804e-06, + "objective/entropy": -26.283668518066406, + "objective/kl": 21.043611526489258, + "objective/non_score_reward": -1.0521806478500366, + "objective/rlhf_reward": -2.6929507491909828, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 337.57025146484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9954912662506104 + }, + { + "episode": 1856, + "epoch": 0.011120298139027693, + "loss/policy_avg": 0.030586296692490578, + "lr": 9.926508179959101e-06, + "objective/entropy": 189.2314910888672, + "objective/kl": 18.47957992553711, + "objective/non_score_reward": -0.9239791035652161, + "objective/rlhf_reward": -3.6959164142608643, + "objective/scores": 0.0, + "policy/approxkl_avg": 158.6993865966797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8671875, + "step": 115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987783432006836 + }, + { + "episode": 1872, + "epoch": 0.011216162778157242, + "loss/policy_avg": 0.23665881156921387, + "lr": 9.925869120654398e-06, + "objective/entropy": 73.73204803466797, + "objective/kl": 19.052127838134766, + "objective/non_score_reward": -0.9526063203811646, + "objective/rlhf_reward": -5.810425281524658, + "objective/scores": -0.5, + "policy/approxkl_avg": 45.58797073364258, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55859375, + "step": 116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978525638580322 + }, + { + "episode": 1888, + "epoch": 0.01131202741728679, + "loss/policy_avg": 0.11659398674964905, + "lr": 9.925230061349695e-06, + "objective/entropy": 128.39474487304688, + "objective/kl": 25.3045597076416, + "objective/non_score_reward": -1.265228033065796, + "objective/rlhf_reward": -2.6609121322631832, + "objective/scores": 0.6, + "policy/approxkl_avg": 76.14613342285156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.64453125, + "step": 117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966604709625244 + }, + { + "episode": 1904, + "epoch": 0.01140789205641634, + "loss/policy_avg": 0.19203245639801025, + "lr": 9.92459100204499e-06, + "objective/entropy": 57.626686096191406, + "objective/kl": 30.407909393310547, + "objective/non_score_reward": -1.5203955173492432, + "objective/rlhf_reward": -1.6815817117691036, + "objective/scores": 1.1, + "policy/approxkl_avg": 324.47161865234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.544921875, + "step": 118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0000627040863037 + }, + { + "episode": 1920, + "epoch": 0.011503756695545889, + "loss/policy_avg": -0.02956828847527504, + "lr": 9.923951942740287e-06, + "objective/entropy": 160.20449829101562, + "objective/kl": 13.33430290222168, + "objective/non_score_reward": -0.666715145111084, + "objective/rlhf_reward": -2.666860580444336, + "objective/scores": 0.0, + "policy/approxkl_avg": 62.17939758300781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003843307495117 + }, + { + "episode": 1936, + "epoch": 0.011599621334675438, + "loss/policy_avg": 0.3666956424713135, + "lr": 9.923312883435584e-06, + "objective/entropy": 173.73385620117188, + "objective/kl": 25.82461929321289, + "objective/non_score_reward": -1.2912311553955078, + "objective/rlhf_reward": -7.164924621582031, + "objective/scores": -0.5, + "policy/approxkl_avg": 248.4417724609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.71484375, + "step": 120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001635551452637 + }, + { + "episode": 1952, + "epoch": 0.011695485973804987, + "loss/policy_avg": 0.07095953077077866, + "lr": 9.92267382413088e-06, + "objective/entropy": 60.89289855957031, + "objective/kl": 21.512653350830078, + "objective/non_score_reward": -1.0756325721740723, + "objective/rlhf_reward": -2.851932506175384, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 187.26104736328125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986932277679443 + }, + { + "episode": 1968, + "epoch": 0.011791350612934537, + "loss/policy_avg": 0.11872611939907074, + "lr": 9.922034764826178e-06, + "objective/entropy": -24.511760711669922, + "objective/kl": 22.253305435180664, + "objective/non_score_reward": -1.1126651763916016, + "objective/rlhf_reward": -4.450661063194275, + "objective/scores": 0.0, + "policy/approxkl_avg": 199.84397888183594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9950015544891357 + }, + { + "episode": 1984, + "epoch": 0.011887215252064086, + "loss/policy_avg": 0.5726426839828491, + "lr": 9.921395705521473e-06, + "objective/entropy": 102.35612487792969, + "objective/kl": 32.768287658691406, + "objective/non_score_reward": -1.6384142637252808, + "objective/rlhf_reward": -8.553656578063965, + "objective/scores": -0.5, + "policy/approxkl_avg": 327.3544921875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.533203125, + "step": 123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.995248556137085 + }, + { + "episode": 2000, + "epoch": 0.011983079891193635, + "loss/policy_avg": 0.19069992005825043, + "lr": 9.92075664621677e-06, + "objective/entropy": 7.145952224731445, + "objective/kl": 17.727392196655273, + "objective/non_score_reward": -0.8863697052001953, + "objective/rlhf_reward": -3.545478705316782, + "objective/scores": 0.0, + "policy/approxkl_avg": 26.600868225097656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011134147644043 + }, + { + "episode": 2016, + "epoch": 0.012078944530323184, + "loss/policy_avg": 0.2572447657585144, + "lr": 9.920117586912067e-06, + "objective/entropy": 109.04229736328125, + "objective/kl": 17.79098129272461, + "objective/non_score_reward": -0.8895490765571594, + "objective/rlhf_reward": -5.558196067810059, + "objective/scores": -0.5, + "policy/approxkl_avg": 111.66732788085938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.763671875, + "step": 125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968197345733643 + }, + { + "episode": 2032, + "epoch": 0.012174809169452733, + "loss/policy_avg": 0.043444547802209854, + "lr": 9.919478527607362e-06, + "objective/entropy": 75.83810424804688, + "objective/kl": 27.20602798461914, + "objective/non_score_reward": -1.3603014945983887, + "objective/rlhf_reward": -2.51748690450308, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 274.793701171875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.521484375, + "step": 126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991501569747925 + }, + { + "episode": 2048, + "epoch": 0.012270673808582282, + "loss/policy_avg": 0.2138219177722931, + "lr": 9.918839468302659e-06, + "objective/entropy": 21.247840881347656, + "objective/kl": 14.299978256225586, + "objective/non_score_reward": -0.7149989008903503, + "objective/rlhf_reward": -2.8599955439567566, + "objective/scores": 0.0, + "policy/approxkl_avg": 21.416780471801758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80078125, + "step": 127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998870849609375 + }, + { + "episode": 2064, + "epoch": 0.012366538447711831, + "loss/policy_avg": 0.23010344803333282, + "lr": 9.918200408997956e-06, + "objective/entropy": -76.91316223144531, + "objective/kl": 13.382017135620117, + "objective/non_score_reward": -0.6691007614135742, + "objective/rlhf_reward": -2.676403224468231, + "objective/scores": 0.0, + "policy/approxkl_avg": 37.060523986816406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.677734375, + "step": 128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998915195465088 + }, + { + "episode": 2080, + "epoch": 0.01246240308684138, + "loss/policy_avg": 0.4017820954322815, + "lr": 9.917561349693252e-06, + "objective/entropy": 198.82456970214844, + "objective/kl": 22.337753295898438, + "objective/non_score_reward": -1.1168878078460693, + "objective/rlhf_reward": -2.9112917771011144, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 41.49570846557617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59765625, + "step": 129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001622200012207 + }, + { + "episode": 2096, + "epoch": 0.012558267725970929, + "loss/policy_avg": 0.45664405822753906, + "lr": 9.91692229038855e-06, + "objective/entropy": 96.67610168457031, + "objective/kl": 13.830822944641113, + "objective/non_score_reward": -0.6915411353111267, + "objective/rlhf_reward": -2.7661644518375397, + "objective/scores": 0.0, + "policy/approxkl_avg": 49.5977783203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.87890625, + "step": 130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0018720626831055 + }, + { + "episode": 2112, + "epoch": 0.012654132365100478, + "loss/policy_avg": 0.18199189007282257, + "lr": 9.916283231083844e-06, + "objective/entropy": 12.164558410644531, + "objective/kl": 17.693878173828125, + "objective/non_score_reward": -0.8846939206123352, + "objective/rlhf_reward": -3.538775682449341, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7435173988342285, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.62890625, + "step": 131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0015268325805664 + }, + { + "episode": 2128, + "epoch": 0.012749997004230027, + "loss/policy_avg": 0.21469825506210327, + "lr": 9.915644171779141e-06, + "objective/entropy": -11.569038391113281, + "objective/kl": 14.204147338867188, + "objective/non_score_reward": -0.7102073431015015, + "objective/rlhf_reward": -1.481579744552059, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 91.71839141845703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0028629302978516 + }, + { + "episode": 2144, + "epoch": 0.012845861643359576, + "loss/policy_avg": 0.27063143253326416, + "lr": 9.915005112474438e-06, + "objective/entropy": 180.4578857421875, + "objective/kl": 24.935741424560547, + "objective/non_score_reward": -1.2467870712280273, + "objective/rlhf_reward": -6.987148284912109, + "objective/scores": -0.5, + "policy/approxkl_avg": 138.678955078125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.529296875, + "step": 133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967637062072754 + }, + { + "episode": 2160, + "epoch": 0.012941726282489125, + "loss/policy_avg": 0.1394023448228836, + "lr": 9.914366053169735e-06, + "objective/entropy": -29.98552703857422, + "objective/kl": 13.385698318481445, + "objective/non_score_reward": -0.6692849397659302, + "objective/rlhf_reward": -2.6771397292613983, + "objective/scores": 0.0, + "policy/approxkl_avg": 73.47354125976562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6171875, + "step": 134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000580310821533 + }, + { + "episode": 2176, + "epoch": 0.013037590921618674, + "loss/policy_avg": 0.0048561920411884785, + "lr": 9.913726993865032e-06, + "objective/entropy": 88.89292907714844, + "objective/kl": 28.03160858154297, + "objective/non_score_reward": -1.4015804529190063, + "objective/rlhf_reward": -4.227719643203121, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 370.16766357421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.995047926902771 + }, + { + "episode": 2192, + "epoch": 0.013133455560748224, + "loss/policy_avg": 0.1565648913383484, + "lr": 9.913087934560329e-06, + "objective/entropy": 48.28108596801758, + "objective/kl": 22.514755249023438, + "objective/non_score_reward": -1.1257379055023193, + "objective/rlhf_reward": -0.10295168161392176, + "objective/scores": 1.1, + "policy/approxkl_avg": 272.63470458984375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58203125, + "step": 136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0003504753112793 + }, + { + "episode": 2208, + "epoch": 0.013229320199877773, + "loss/policy_avg": 0.1350177526473999, + "lr": 9.912448875255624e-06, + "objective/entropy": 184.51797485351562, + "objective/kl": 30.795909881591797, + "objective/non_score_reward": -1.5397955179214478, + "objective/rlhf_reward": -8.159181594848633, + "objective/scores": -0.5, + "policy/approxkl_avg": 407.7762145996094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578125, + "step": 137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997571349143982 + }, + { + "episode": 2224, + "epoch": 0.013325184839007322, + "loss/policy_avg": 0.2587956190109253, + "lr": 9.911809815950921e-06, + "objective/entropy": 14.785064697265625, + "objective/kl": 23.858671188354492, + "objective/non_score_reward": -1.1929335594177246, + "objective/rlhf_reward": -4.77173438668251, + "objective/scores": 0.0, + "policy/approxkl_avg": 258.4976501464844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996340036392212 + }, + { + "episode": 2240, + "epoch": 0.013421049478136871, + "loss/policy_avg": 0.03932709991931915, + "lr": 9.911170756646218e-06, + "objective/entropy": -88.26953887939453, + "objective/kl": 11.428003311157227, + "objective/non_score_reward": -0.5714001655578613, + "objective/rlhf_reward": -4.285600662231445, + "objective/scores": -0.5, + "policy/approxkl_avg": 41.424224853515625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.775390625, + "step": 139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0010673999786377 + }, + { + "episode": 2256, + "epoch": 0.01351691411726642, + "loss/policy_avg": 0.20215287804603577, + "lr": 9.910531697341515e-06, + "objective/entropy": 66.65933227539062, + "objective/kl": 20.929710388183594, + "objective/non_score_reward": -1.046485424041748, + "objective/rlhf_reward": 0.2140582442283634, + "objective/scores": 1.1, + "policy/approxkl_avg": 124.57344055175781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977262020111084 + }, + { + "episode": 2272, + "epoch": 0.01361277875639597, + "loss/policy_avg": 0.5699018836021423, + "lr": 9.909892638036812e-06, + "objective/entropy": 10.020034790039062, + "objective/kl": 16.74536895751953, + "objective/non_score_reward": -0.8372684717178345, + "objective/rlhf_reward": 1.0509260237216953, + "objective/scores": 1.1, + "policy/approxkl_avg": 193.53329467773438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.640625, + "step": 141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986698627471924 + }, + { + "episode": 2288, + "epoch": 0.013708643395525518, + "loss/policy_avg": 0.15781471133232117, + "lr": 9.909253578732107e-06, + "objective/entropy": -44.31187438964844, + "objective/kl": 30.742799758911133, + "objective/non_score_reward": -1.537139892578125, + "objective/rlhf_reward": -8.1485595703125, + "objective/scores": -0.5, + "policy/approxkl_avg": 158.5760498046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73828125, + "step": 142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987871646881104 + }, + { + "episode": 2304, + "epoch": 0.013804508034655067, + "loss/policy_avg": 0.09526471048593521, + "lr": 9.908614519427404e-06, + "objective/entropy": 52.471221923828125, + "objective/kl": 19.550655364990234, + "objective/non_score_reward": -0.9775327444076538, + "objective/rlhf_reward": -2.4595327778771967, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 176.07566833496094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.91796875, + "step": 143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970924854278564 + }, + { + "episode": 2320, + "epoch": 0.013900372673784616, + "loss/policy_avg": 0.03243420645594597, + "lr": 9.9079754601227e-06, + "objective/entropy": 128.66928100585938, + "objective/kl": 21.24932861328125, + "objective/non_score_reward": -1.0624663829803467, + "objective/rlhf_reward": -1.849865472316742, + "objective/scores": 0.6, + "policy/approxkl_avg": 188.10623168945312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.470703125, + "step": 144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996135234832764 + }, + { + "episode": 2336, + "epoch": 0.013996237312914165, + "loss/policy_avg": 0.25250673294067383, + "lr": 9.907336400817996e-06, + "objective/entropy": 198.611083984375, + "objective/kl": 21.650169372558594, + "objective/non_score_reward": -1.0825085639953613, + "objective/rlhf_reward": -6.330034255981445, + "objective/scores": -0.5, + "policy/approxkl_avg": 130.0052490234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.892578125, + "step": 145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994676113128662 + }, + { + "episode": 2352, + "epoch": 0.014092101952043714, + "loss/policy_avg": -0.034668684005737305, + "lr": 9.906697341513293e-06, + "objective/entropy": -27.681907653808594, + "objective/kl": 26.334529876708984, + "objective/non_score_reward": -1.316726565361023, + "objective/rlhf_reward": -3.907656395171566, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 210.81857299804688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.552734375, + "step": 146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9972717761993408 + }, + { + "episode": 2368, + "epoch": 0.014187966591173263, + "loss/policy_avg": 0.12787118554115295, + "lr": 9.90605828220859e-06, + "objective/entropy": -43.486568450927734, + "objective/kl": 20.714540481567383, + "objective/non_score_reward": -1.0357270240783691, + "objective/rlhf_reward": -4.142907917499542, + "objective/scores": 0.0, + "policy/approxkl_avg": 86.59358978271484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.673828125, + "step": 147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999436378479004 + }, + { + "episode": 2384, + "epoch": 0.014283831230302812, + "loss/policy_avg": 0.03302329033613205, + "lr": 9.905419222903886e-06, + "objective/entropy": 173.50836181640625, + "objective/kl": 23.32859230041504, + "objective/non_score_reward": -1.1664297580718994, + "objective/rlhf_reward": -3.3402058220206925, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 224.7312774658203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.759765625, + "step": 148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986599683761597 + }, + { + "episode": 2400, + "epoch": 0.01437969586943236, + "loss/policy_avg": 0.04894339293241501, + "lr": 9.904780163599183e-06, + "objective/entropy": 48.957122802734375, + "objective/kl": 25.618064880371094, + "objective/non_score_reward": -1.2809032201766968, + "objective/rlhf_reward": -3.699780781467525, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 230.8973388671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.576171875, + "step": 149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9963643550872803 + }, + { + "episode": 2416, + "epoch": 0.01447556050856191, + "loss/policy_avg": 0.36329030990600586, + "lr": 9.904141104294478e-06, + "objective/entropy": 157.50445556640625, + "objective/kl": 23.93838882446289, + "objective/non_score_reward": -1.1969194412231445, + "objective/rlhf_reward": -4.787678003311157, + "objective/scores": 0.0, + "policy/approxkl_avg": 200.30621337890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9953033924102783 + }, + { + "episode": 2432, + "epoch": 0.01457142514769146, + "loss/policy_avg": 0.3649589419364929, + "lr": 9.903502044989775e-06, + "objective/entropy": 213.43943786621094, + "objective/kl": 19.777463912963867, + "objective/non_score_reward": -0.9888731837272644, + "objective/rlhf_reward": -2.6138571410471494, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 81.07418060302734, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.677734375, + "step": 151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006325244903564 + }, + { + "episode": 2448, + "epoch": 0.01466728978682101, + "loss/policy_avg": 0.35868164896965027, + "lr": 9.902862985685072e-06, + "objective/entropy": 2.8281936645507812, + "objective/kl": 23.836688995361328, + "objective/non_score_reward": -1.191834568977356, + "objective/rlhf_reward": -6.767337799072266, + "objective/scores": -0.5, + "policy/approxkl_avg": 91.45345306396484, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5703125, + "step": 152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970684051513672 + }, + { + "episode": 2464, + "epoch": 0.014763154425950558, + "loss/policy_avg": 0.07516692578792572, + "lr": 9.902223926380369e-06, + "objective/entropy": 171.71034240722656, + "objective/kl": 21.95725440979004, + "objective/non_score_reward": -1.097862720489502, + "objective/rlhf_reward": -4.3914510905742645, + "objective/scores": 0.0, + "policy/approxkl_avg": 221.61212158203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973832368850708 + }, + { + "episode": 2480, + "epoch": 0.014859019065080107, + "loss/policy_avg": 0.01844581961631775, + "lr": 9.901584867075666e-06, + "objective/entropy": 64.6440200805664, + "objective/kl": 17.436233520507812, + "objective/non_score_reward": -0.8718117475509644, + "objective/rlhf_reward": -5.487246990203857, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.52445983886719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.755859375, + "step": 154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000408172607422 + }, + { + "episode": 2496, + "epoch": 0.014954883704209656, + "loss/policy_avg": 0.05899505689740181, + "lr": 9.900945807770961e-06, + "objective/entropy": 123.7980728149414, + "objective/kl": 25.07213592529297, + "objective/non_score_reward": -1.253606915473938, + "objective/rlhf_reward": -7.014427661895752, + "objective/scores": -0.5, + "policy/approxkl_avg": 88.28120422363281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.51171875, + "step": 155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000736951828003 + }, + { + "episode": 2512, + "epoch": 0.015050748343339205, + "loss/policy_avg": 0.4545804560184479, + "lr": 9.900306748466258e-06, + "objective/entropy": 10.871131896972656, + "objective/kl": 26.028505325317383, + "objective/non_score_reward": -1.3014252185821533, + "objective/rlhf_reward": -5.205701023340225, + "objective/scores": 0.0, + "policy/approxkl_avg": 217.84939575195312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5859375, + "step": 156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983347654342651 + }, + { + "episode": 2528, + "epoch": 0.015146612982468754, + "loss/policy_avg": 0.4191577136516571, + "lr": 9.899667689161555e-06, + "objective/entropy": 109.52301025390625, + "objective/kl": 27.344154357910156, + "objective/non_score_reward": -1.3672077655792236, + "objective/rlhf_reward": -7.4688310623168945, + "objective/scores": -0.5, + "policy/approxkl_avg": 89.15927124023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999230146408081 + }, + { + "episode": 2544, + "epoch": 0.015242477621598303, + "loss/policy_avg": 0.1439390629529953, + "lr": 9.899028629856852e-06, + "objective/entropy": 220.28952026367188, + "objective/kl": 19.178768157958984, + "objective/non_score_reward": -0.9589384198188782, + "objective/rlhf_reward": -3.8357537388801575, + "objective/scores": 0.0, + "policy/approxkl_avg": 71.46617126464844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8828125, + "step": 158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997718334197998 + }, + { + "episode": 2560, + "epoch": 0.015338342260727852, + "loss/policy_avg": 0.30983591079711914, + "lr": 9.898389570552149e-06, + "objective/entropy": 104.64752197265625, + "objective/kl": 27.657455444335938, + "objective/non_score_reward": -1.3828728199005127, + "objective/rlhf_reward": -3.7981575886408487, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 138.6593017578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977858066558838 + }, + { + "episode": 2576, + "epoch": 0.015434206899857401, + "loss/policy_avg": 0.07094208896160126, + "lr": 9.897750511247446e-06, + "objective/entropy": 90.89056396484375, + "objective/kl": 21.30394744873047, + "objective/non_score_reward": -1.065197467803955, + "objective/rlhf_reward": -4.260790050029755, + "objective/scores": 0.0, + "policy/approxkl_avg": 297.8140563964844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.80859375, + "step": 160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0023467540740967 + }, + { + "episode": 2592, + "epoch": 0.01553007153898695, + "loss/policy_avg": 0.16327880322933197, + "lr": 9.89711145194274e-06, + "objective/entropy": 100.65301513671875, + "objective/kl": 24.78557586669922, + "objective/non_score_reward": -1.239278793334961, + "objective/rlhf_reward": -3.1322867229309788, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 208.9399871826172, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.625, + "step": 161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0041985511779785 + }, + { + "episode": 2608, + "epoch": 0.0156259361781165, + "loss/policy_avg": 0.216099351644516, + "lr": 9.896472392638038e-06, + "objective/entropy": 57.33685302734375, + "objective/kl": 29.304649353027344, + "objective/non_score_reward": -1.4652326107025146, + "objective/rlhf_reward": -7.8609299659729, + "objective/scores": -0.5, + "policy/approxkl_avg": 207.6790313720703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.833984375, + "step": 162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980382919311523 + }, + { + "episode": 2624, + "epoch": 0.01572180081724605, + "loss/policy_avg": 0.0953613817691803, + "lr": 9.895833333333334e-06, + "objective/entropy": -133.8697967529297, + "objective/kl": 16.73604965209961, + "objective/non_score_reward": -0.83680260181427, + "objective/rlhf_reward": -3.347210466861725, + "objective/scores": 0.0, + "policy/approxkl_avg": 145.06759643554688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.802734375, + "step": 163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988582134246826 + }, + { + "episode": 2640, + "epoch": 0.0158176654563756, + "loss/policy_avg": 0.42890581488609314, + "lr": 9.895194274028631e-06, + "objective/entropy": 180.38433837890625, + "objective/kl": 25.974315643310547, + "objective/non_score_reward": -1.2987157106399536, + "objective/rlhf_reward": -7.1948628425598145, + "objective/scores": -0.5, + "policy/approxkl_avg": 126.24812316894531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73046875, + "step": 164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987969398498535 + }, + { + "episode": 2656, + "epoch": 0.015913530095505148, + "loss/policy_avg": 0.1958284080028534, + "lr": 9.894555214723928e-06, + "objective/entropy": 174.94032287597656, + "objective/kl": 25.64311981201172, + "objective/non_score_reward": -1.282155990600586, + "objective/rlhf_reward": -3.786988249331146, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 86.50934600830078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.705078125, + "step": 165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9944665431976318 + }, + { + "episode": 2672, + "epoch": 0.016009394734634697, + "loss/policy_avg": 0.3368389904499054, + "lr": 9.893916155419225e-06, + "objective/entropy": 10.10284423828125, + "objective/kl": 24.560279846191406, + "objective/non_score_reward": -1.2280139923095703, + "objective/rlhf_reward": -6.912055969238281, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.86394500732422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986299276351929 + }, + { + "episode": 2688, + "epoch": 0.016105259373764245, + "loss/policy_avg": 0.11198948323726654, + "lr": 9.89327709611452e-06, + "objective/entropy": 161.62661743164062, + "objective/kl": 39.22645568847656, + "objective/non_score_reward": -1.9613227844238281, + "objective/rlhf_reward": -7.845290899276733, + "objective/scores": 0.0, + "policy/approxkl_avg": 164.2472381591797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.841796875, + "step": 167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000530958175659 + }, + { + "episode": 2704, + "epoch": 0.016201124012893794, + "loss/policy_avg": 0.19894596934318542, + "lr": 9.892638036809815e-06, + "objective/entropy": 137.1651153564453, + "objective/kl": 24.861934661865234, + "objective/non_score_reward": -1.2430968284606934, + "objective/rlhf_reward": -3.147558684619974, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 133.68283081054688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66796875, + "step": 168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997298240661621 + }, + { + "episode": 2720, + "epoch": 0.016296988652023343, + "loss/policy_avg": 0.12182526290416718, + "lr": 9.891998977505112e-06, + "objective/entropy": 18.006725311279297, + "objective/kl": 17.92361068725586, + "objective/non_score_reward": -0.8961805701255798, + "objective/rlhf_reward": -2.134124155254707, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 192.45278930664062, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.583984375, + "step": 169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006425380706787 + }, + { + "episode": 2736, + "epoch": 0.016392853291152892, + "loss/policy_avg": -0.012196972966194153, + "lr": 9.89135991820041e-06, + "objective/entropy": 115.16173553466797, + "objective/kl": 21.65146827697754, + "objective/non_score_reward": -1.082573413848877, + "objective/rlhf_reward": -2.5054651006785145, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 145.09487915039062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.71484375, + "step": 170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983795881271362 + }, + { + "episode": 2752, + "epoch": 0.01648871793028244, + "loss/policy_avg": 0.5326859951019287, + "lr": 9.890720858895706e-06, + "objective/entropy": 95.13655853271484, + "objective/kl": 24.00056266784668, + "objective/non_score_reward": -1.2000280618667603, + "objective/rlhf_reward": -4.800112426280975, + "objective/scores": 0.0, + "policy/approxkl_avg": 119.50138854980469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006933212280273 + }, + { + "episode": 2768, + "epoch": 0.01658458256941199, + "loss/policy_avg": 0.3668867349624634, + "lr": 9.890081799591003e-06, + "objective/entropy": 132.36126708984375, + "objective/kl": 21.386262893676758, + "objective/non_score_reward": -1.0693132877349854, + "objective/rlhf_reward": -1.8772529125213624, + "objective/scores": 0.6, + "policy/approxkl_avg": 48.470794677734375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9946337938308716 + }, + { + "episode": 2784, + "epoch": 0.01668044720854154, + "loss/policy_avg": 0.460104763507843, + "lr": 9.8894427402863e-06, + "objective/entropy": 129.8038330078125, + "objective/kl": 25.860858917236328, + "objective/non_score_reward": -1.2930430173873901, + "objective/rlhf_reward": -3.656400167735752, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 146.58050537109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5625, + "step": 173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9949872493743896 + }, + { + "episode": 2800, + "epoch": 0.016776311847671088, + "loss/policy_avg": 0.11980315297842026, + "lr": 9.888803680981595e-06, + "objective/entropy": 173.85202026367188, + "objective/kl": 23.159679412841797, + "objective/non_score_reward": -1.1579840183258057, + "objective/rlhf_reward": -6.631936073303223, + "objective/scores": -0.5, + "policy/approxkl_avg": 84.56037902832031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7734375, + "step": 174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972279071807861 + }, + { + "episode": 2816, + "epoch": 0.016872176486800637, + "loss/policy_avg": 0.060305699706077576, + "lr": 9.888164621676892e-06, + "objective/entropy": 90.63494110107422, + "objective/kl": 23.04631805419922, + "objective/non_score_reward": -1.1523159742355347, + "objective/rlhf_reward": -4.609263688325882, + "objective/scores": 0.0, + "policy/approxkl_avg": 204.8768310546875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.564453125, + "step": 175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992055892944336 + }, + { + "episode": 2832, + "epoch": 0.016968041125930186, + "loss/policy_avg": 0.5130124092102051, + "lr": 9.887525562372189e-06, + "objective/entropy": 66.25984191894531, + "objective/kl": 32.70683288574219, + "objective/non_score_reward": -1.635341763496399, + "objective/rlhf_reward": -6.541367173194885, + "objective/scores": 0.0, + "policy/approxkl_avg": 265.0827941894531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996222734451294 + }, + { + "episode": 2848, + "epoch": 0.017063905765059735, + "loss/policy_avg": 0.008577877655625343, + "lr": 9.886886503067486e-06, + "objective/entropy": -118.17359924316406, + "objective/kl": 21.02519989013672, + "objective/non_score_reward": -1.051259994506836, + "objective/rlhf_reward": -4.205039799213409, + "objective/scores": 0.0, + "policy/approxkl_avg": 260.4126892089844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66015625, + "step": 177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000894069671631 + }, + { + "episode": 2864, + "epoch": 0.017159770404189284, + "loss/policy_avg": 0.18238189816474915, + "lr": 9.886247443762783e-06, + "objective/entropy": 130.0546875, + "objective/kl": 32.371009826660156, + "objective/non_score_reward": -1.6185506582260132, + "objective/rlhf_reward": -8.474203109741211, + "objective/scores": -0.5, + "policy/approxkl_avg": 228.2266387939453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7890625, + "step": 178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9942381381988525 + }, + { + "episode": 2880, + "epoch": 0.017255635043318833, + "loss/policy_avg": 0.18286140263080597, + "lr": 9.88560838445808e-06, + "objective/entropy": -10.639881134033203, + "objective/kl": 29.253890991210938, + "objective/non_score_reward": -1.462694525718689, + "objective/rlhf_reward": -7.850778102874756, + "objective/scores": -0.5, + "policy/approxkl_avg": 148.62832641601562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.798828125, + "step": 179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988160133361816 + }, + { + "episode": 2896, + "epoch": 0.017351499682448382, + "loss/policy_avg": 0.07891340553760529, + "lr": 9.884969325153375e-06, + "objective/entropy": -120.97007751464844, + "objective/kl": 21.97601890563965, + "objective/non_score_reward": -1.0988008975982666, + "objective/rlhf_reward": -2.2724974177041393, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 200.60455322265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.533203125, + "step": 180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000365972518921 + }, + { + "episode": 2912, + "epoch": 0.01744736432157793, + "loss/policy_avg": 0.06744587421417236, + "lr": 9.884330265848671e-06, + "objective/entropy": 73.97216796875, + "objective/kl": 19.66523551940918, + "objective/non_score_reward": -0.9832619428634644, + "objective/rlhf_reward": -5.933047771453857, + "objective/scores": -0.5, + "policy/approxkl_avg": 168.08172607421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997870683670044 + }, + { + "episode": 2928, + "epoch": 0.01754322896070748, + "loss/policy_avg": 0.12424597889184952, + "lr": 9.883691206543968e-06, + "objective/entropy": 77.82262420654297, + "objective/kl": 21.0150146484375, + "objective/non_score_reward": -1.050750732421875, + "objective/rlhf_reward": 0.1969969511032108, + "objective/scores": 1.1, + "policy/approxkl_avg": 109.60333251953125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.693359375, + "step": 182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996559500694275 + }, + { + "episode": 2944, + "epoch": 0.01763909359983703, + "loss/policy_avg": 0.2606327533721924, + "lr": 9.883052147239265e-06, + "objective/entropy": 172.60293579101562, + "objective/kl": 29.473426818847656, + "objective/non_score_reward": -1.4736714363098145, + "objective/rlhf_reward": -5.894685626029968, + "objective/scores": 0.0, + "policy/approxkl_avg": 194.51976013183594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.818359375, + "step": 183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9962568283081055 + }, + { + "episode": 2960, + "epoch": 0.017734958238966578, + "loss/policy_avg": 0.10910254716873169, + "lr": 9.882413087934562e-06, + "objective/entropy": 240.20162963867188, + "objective/kl": 15.176373481750488, + "objective/non_score_reward": -0.7588187456130981, + "objective/rlhf_reward": 1.3647250771522526, + "objective/scores": 1.1, + "policy/approxkl_avg": 43.88645935058594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8203125, + "step": 184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006511211395264 + }, + { + "episode": 2976, + "epoch": 0.017830822878096127, + "loss/policy_avg": 0.5547807812690735, + "lr": 9.881774028629857e-06, + "objective/entropy": 85.18072509765625, + "objective/kl": 21.537092208862305, + "objective/non_score_reward": -1.0768545866012573, + "objective/rlhf_reward": -6.307418346405029, + "objective/scores": -0.5, + "policy/approxkl_avg": 161.64654541015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.75390625, + "step": 185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996278285980225 + }, + { + "episode": 2992, + "epoch": 0.017926687517225676, + "loss/policy_avg": 0.3181283175945282, + "lr": 9.881134969325154e-06, + "objective/entropy": 129.28257751464844, + "objective/kl": 28.59075927734375, + "objective/non_score_reward": -1.4295378923416138, + "objective/rlhf_reward": -5.718151569366455, + "objective/scores": 0.0, + "policy/approxkl_avg": 53.536468505859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998319149017334 + }, + { + "episode": 3008, + "epoch": 0.018022552156355228, + "loss/policy_avg": 0.37268152832984924, + "lr": 9.880495910020451e-06, + "objective/entropy": 183.25308227539062, + "objective/kl": 29.692989349365234, + "objective/non_score_reward": -1.484649419784546, + "objective/rlhf_reward": -7.938598155975342, + "objective/scores": -0.5, + "policy/approxkl_avg": 91.56600952148438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.599609375, + "step": 187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983760118484497 + }, + { + "episode": 3024, + "epoch": 0.018118416795484777, + "loss/policy_avg": 0.7035294771194458, + "lr": 9.879856850715748e-06, + "objective/entropy": -141.20687866210938, + "objective/kl": 16.28227996826172, + "objective/non_score_reward": -0.8141138553619385, + "objective/rlhf_reward": -5.256455421447754, + "objective/scores": -0.5, + "policy/approxkl_avg": 39.17454528808594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.912109375, + "step": 188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000697612762451 + }, + { + "episode": 3040, + "epoch": 0.018214281434614326, + "loss/policy_avg": 0.23234151303768158, + "lr": 9.879217791411043e-06, + "objective/entropy": 41.34138107299805, + "objective/kl": 27.16008758544922, + "objective/non_score_reward": -1.3580043315887451, + "objective/rlhf_reward": -4.05341557511459, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 271.9233093261719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.51171875, + "step": 189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999565839767456 + }, + { + "episode": 3056, + "epoch": 0.018310146073743875, + "loss/policy_avg": 0.07211380451917648, + "lr": 9.87857873210634e-06, + "objective/entropy": 20.789365768432617, + "objective/kl": 19.183855056762695, + "objective/non_score_reward": -0.9591927528381348, + "objective/rlhf_reward": -5.836771011352539, + "objective/scores": -0.5, + "policy/approxkl_avg": 108.85220336914062, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6328125, + "step": 190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0007762908935547 + }, + { + "episode": 3072, + "epoch": 0.018406010712873424, + "loss/policy_avg": 0.7360332012176514, + "lr": 9.877939672801637e-06, + "objective/entropy": 219.01002502441406, + "objective/kl": 30.353984832763672, + "objective/non_score_reward": -1.5176992416381836, + "objective/rlhf_reward": -4.692195155707699, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 60.275230407714844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73046875, + "step": 191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968408346176147 + }, + { + "episode": 3088, + "epoch": 0.018501875352002973, + "loss/policy_avg": 0.7660672664642334, + "lr": 9.877300613496934e-06, + "objective/entropy": 192.5721435546875, + "objective/kl": 18.974138259887695, + "objective/non_score_reward": -0.948706865310669, + "objective/rlhf_reward": -5.794827461242676, + "objective/scores": -0.5, + "policy/approxkl_avg": 208.0977783203125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.568359375, + "step": 192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996788740158081 + }, + { + "episode": 3104, + "epoch": 0.018597739991132522, + "loss/policy_avg": 0.4530583620071411, + "lr": 9.876661554192229e-06, + "objective/entropy": 185.2235107421875, + "objective/kl": 24.102296829223633, + "objective/non_score_reward": -1.2051149606704712, + "objective/rlhf_reward": -0.4204598426818844, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.94757080078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.728515625, + "step": 193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9967865943908691 + }, + { + "episode": 3120, + "epoch": 0.01869360463026207, + "loss/policy_avg": 0.03487266227602959, + "lr": 9.876022494887526e-06, + "objective/entropy": 183.6469268798828, + "objective/kl": 20.133817672729492, + "objective/non_score_reward": -1.0066908597946167, + "objective/rlhf_reward": -4.026763558387756, + "objective/scores": 0.0, + "policy/approxkl_avg": 168.301025390625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7578125, + "step": 194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0027122497558594 + }, + { + "episode": 3136, + "epoch": 0.01878946926939162, + "loss/policy_avg": -0.029073666781187057, + "lr": 9.875383435582823e-06, + "objective/entropy": 138.25656127929688, + "objective/kl": 18.322010040283203, + "objective/non_score_reward": -0.9161005020141602, + "objective/rlhf_reward": -5.664402008056641, + "objective/scores": -0.5, + "policy/approxkl_avg": 60.4761848449707, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.673828125, + "step": 195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991329908370972 + }, + { + "episode": 3152, + "epoch": 0.01888533390852117, + "loss/policy_avg": 0.14693962037563324, + "lr": 9.87474437627812e-06, + "objective/entropy": 71.7930908203125, + "objective/kl": 19.49433135986328, + "objective/non_score_reward": -0.9747166633605957, + "objective/rlhf_reward": -3.8988667130470276, + "objective/scores": 0.0, + "policy/approxkl_avg": 126.81082153320312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.767578125, + "step": 196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977989196777344 + }, + { + "episode": 3168, + "epoch": 0.018981198547650718, + "loss/policy_avg": 0.6557031869888306, + "lr": 9.874105316973416e-06, + "objective/entropy": -9.503684997558594, + "objective/kl": 21.540775299072266, + "objective/non_score_reward": -1.0770388841629028, + "objective/rlhf_reward": -2.9295533085740626, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 100.91127014160156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9948209524154663 + }, + { + "episode": 3184, + "epoch": 0.019077063186780267, + "loss/policy_avg": 0.23461255431175232, + "lr": 9.873466257668712e-06, + "objective/entropy": -49.81024932861328, + "objective/kl": 20.112146377563477, + "objective/non_score_reward": -1.0056073665618896, + "objective/rlhf_reward": -6.0224289894104, + "objective/scores": -0.5, + "policy/approxkl_avg": 135.8631134033203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58203125, + "step": 198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999420404434204 + }, + { + "episode": 3200, + "epoch": 0.019172927825909816, + "loss/policy_avg": -0.5007312297821045, + "lr": 9.872827198364009e-06, + "objective/entropy": -25.718414306640625, + "objective/kl": 15.317103385925293, + "objective/non_score_reward": -0.7658551931381226, + "objective/rlhf_reward": -5.06342077255249, + "objective/scores": -0.5, + "policy/approxkl_avg": 154.1348876953125, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.796875, + "step": 199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.01686954498291 + }, + { + "episode": 3216, + "epoch": 0.019268792465039365, + "loss/policy_avg": -0.08025580644607544, + "lr": 9.872188139059305e-06, + "objective/entropy": 205.03077697753906, + "objective/kl": 21.50469207763672, + "objective/non_score_reward": -1.0752345323562622, + "objective/rlhf_reward": 0.09906184077262914, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.594398498535156, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.67578125, + "step": 200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998137354850769 + }, + { + "episode": 3232, + "epoch": 0.019364657104168913, + "loss/policy_avg": 0.09840521216392517, + "lr": 9.871549079754602e-06, + "objective/entropy": 94.7195053100586, + "objective/kl": 23.290691375732422, + "objective/non_score_reward": -1.164534568786621, + "objective/rlhf_reward": -6.658138275146484, + "objective/scores": -0.5, + "policy/approxkl_avg": 61.80426788330078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.568359375, + "step": 201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993457794189453 + }, + { + "episode": 3248, + "epoch": 0.019460521743298462, + "loss/policy_avg": 0.20220378041267395, + "lr": 9.8709100204499e-06, + "objective/entropy": 138.21493530273438, + "objective/kl": 26.57837677001953, + "objective/non_score_reward": -1.328918695449829, + "objective/rlhf_reward": -7.315675258636475, + "objective/scores": -0.5, + "policy/approxkl_avg": 125.10806274414062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978711605072021 + }, + { + "episode": 3264, + "epoch": 0.01955638638242801, + "loss/policy_avg": 0.20998699963092804, + "lr": 9.870270961145196e-06, + "objective/entropy": 183.40069580078125, + "objective/kl": 22.997217178344727, + "objective/non_score_reward": -1.1498608589172363, + "objective/rlhf_reward": -4.599443554878235, + "objective/scores": 0.0, + "policy/approxkl_avg": 99.0360336303711, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.599609375, + "step": 203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988300800323486 + }, + { + "episode": 3280, + "epoch": 0.01965225102155756, + "loss/policy_avg": 0.11741530895233154, + "lr": 9.869631901840491e-06, + "objective/entropy": 10.714279174804688, + "objective/kl": 29.581377029418945, + "objective/non_score_reward": -1.4790689945220947, + "objective/rlhf_reward": -4.435322943146586, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 252.8714599609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.76953125, + "step": 204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974395036697388 + }, + { + "episode": 3296, + "epoch": 0.01974811566068711, + "loss/policy_avg": 0.28326037526130676, + "lr": 9.868992842535788e-06, + "objective/entropy": 115.94627380371094, + "objective/kl": 29.00347137451172, + "objective/non_score_reward": -1.4501736164093018, + "objective/rlhf_reward": -5.800694525241852, + "objective/scores": 0.0, + "policy/approxkl_avg": 219.1302032470703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.853515625, + "step": 205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988524913787842 + }, + { + "episode": 3312, + "epoch": 0.019843980299816658, + "loss/policy_avg": 0.11076626181602478, + "lr": 9.868353783231085e-06, + "objective/entropy": 146.21900939941406, + "objective/kl": 33.56465530395508, + "objective/non_score_reward": -1.6782327890396118, + "objective/rlhf_reward": -6.712931394577026, + "objective/scores": 0.0, + "policy/approxkl_avg": 157.33514404296875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.908203125, + "step": 206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998185157775879 + }, + { + "episode": 3328, + "epoch": 0.019939844938946207, + "loss/policy_avg": -0.04051626846194267, + "lr": 9.867714723926382e-06, + "objective/entropy": 80.3193130493164, + "objective/kl": 31.786861419677734, + "objective/non_score_reward": -1.5893430709838867, + "objective/rlhf_reward": -3.433653388859007, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 220.55699157714844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.52734375, + "step": 207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982627630233765 + }, + { + "episode": 3344, + "epoch": 0.020035709578075756, + "loss/policy_avg": 0.0633954256772995, + "lr": 9.867075664621679e-06, + "objective/entropy": 167.83624267578125, + "objective/kl": 28.54816246032715, + "objective/non_score_reward": -1.427408218383789, + "objective/rlhf_reward": -7.709632873535156, + "objective/scores": -0.5, + "policy/approxkl_avg": 161.32330322265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.77734375, + "step": 208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011916160583496 + }, + { + "episode": 3360, + "epoch": 0.020131574217205305, + "loss/policy_avg": 0.4134795665740967, + "lr": 9.866436605316974e-06, + "objective/entropy": 190.93576049804688, + "objective/kl": 31.057029724121094, + "objective/non_score_reward": -1.5528514385223389, + "objective/rlhf_reward": -4.852156007026119, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 412.91265869140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.833984375, + "step": 209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969439506530762 + }, + { + "episode": 3376, + "epoch": 0.020227438856334854, + "loss/policy_avg": 0.011405892670154572, + "lr": 9.86579754601227e-06, + "objective/entropy": -0.3257408142089844, + "objective/kl": 30.285137176513672, + "objective/non_score_reward": -1.5142569541931152, + "objective/rlhf_reward": -6.057027459144592, + "objective/scores": 0.0, + "policy/approxkl_avg": 255.63124084472656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.716796875, + "step": 210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983439445495605 + }, + { + "episode": 3392, + "epoch": 0.020323303495464403, + "loss/policy_avg": 0.02998751401901245, + "lr": 9.865158486707568e-06, + "objective/entropy": 18.811004638671875, + "objective/kl": 26.8281307220459, + "objective/non_score_reward": -1.3414065837860107, + "objective/rlhf_reward": -2.4419072612535686, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 244.22311401367188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.79296875, + "step": 211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999415636062622 + }, + { + "episode": 3408, + "epoch": 0.020419168134593952, + "loss/policy_avg": 0.3882741332054138, + "lr": 9.864519427402863e-06, + "objective/entropy": 151.1260223388672, + "objective/kl": 34.16276550292969, + "objective/non_score_reward": -1.708138108253479, + "objective/rlhf_reward": -5.170693164289581, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 124.56742858886719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970864057540894 + }, + { + "episode": 3424, + "epoch": 0.0205150327737235, + "loss/policy_avg": 0.2528703212738037, + "lr": 9.86388036809816e-06, + "objective/entropy": 71.80561065673828, + "objective/kl": 30.099973678588867, + "objective/non_score_reward": -1.5049986839294434, + "objective/rlhf_reward": -8.019994735717773, + "objective/scores": -0.5, + "policy/approxkl_avg": 310.709716796875, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.748046875, + "step": 213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997683048248291 + }, + { + "episode": 3440, + "epoch": 0.02061089741285305, + "loss/policy_avg": -0.011442364193499088, + "lr": 9.863241308793457e-06, + "objective/entropy": 148.97042846679688, + "objective/kl": 25.652734756469727, + "objective/non_score_reward": -1.2826368808746338, + "objective/rlhf_reward": -2.7305472850799557, + "objective/scores": 0.6, + "policy/approxkl_avg": 60.726966857910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99959135055542 + }, + { + "episode": 3456, + "epoch": 0.0207067620519826, + "loss/policy_avg": 0.30335062742233276, + "lr": 9.862602249488753e-06, + "objective/entropy": 64.34867858886719, + "objective/kl": 26.22498321533203, + "objective/non_score_reward": -1.3112492561340332, + "objective/rlhf_reward": -7.244997024536133, + "objective/scores": -0.5, + "policy/approxkl_avg": 180.33152770996094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.716796875, + "step": 215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99501633644104 + }, + { + "episode": 3472, + "epoch": 0.02080262669111215, + "loss/policy_avg": 0.2380252331495285, + "lr": 9.86196319018405e-06, + "objective/entropy": -28.018264770507812, + "objective/kl": 25.426055908203125, + "objective/non_score_reward": -1.2713027000427246, + "objective/rlhf_reward": -5.0852110385894775, + "objective/scores": 0.0, + "policy/approxkl_avg": 168.28411865234375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.623046875, + "step": 216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996917486190796 + }, + { + "episode": 3488, + "epoch": 0.0208984913302417, + "loss/policy_avg": 0.35980474948883057, + "lr": 9.861324130879346e-06, + "objective/entropy": -56.049781799316406, + "objective/kl": 18.423480987548828, + "objective/non_score_reward": -0.9211740493774414, + "objective/rlhf_reward": -3.6846961677074432, + "objective/scores": 0.0, + "policy/approxkl_avg": 23.263450622558594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.619140625, + "step": 217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986913204193115 + }, + { + "episode": 3504, + "epoch": 0.02099435596937125, + "loss/policy_avg": 0.11377542465925217, + "lr": 9.860685071574642e-06, + "objective/entropy": 51.95567321777344, + "objective/kl": 30.016387939453125, + "objective/non_score_reward": -1.500819444656372, + "objective/rlhf_reward": -8.003277778625488, + "objective/scores": -0.5, + "policy/approxkl_avg": 219.76956176757812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.673828125, + "step": 218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001506805419922 + }, + { + "episode": 3520, + "epoch": 0.021090220608500798, + "loss/policy_avg": 0.11929692327976227, + "lr": 9.86004601226994e-06, + "objective/entropy": 54.36243438720703, + "objective/kl": 24.446704864501953, + "objective/non_score_reward": -1.2223353385925293, + "objective/rlhf_reward": -4.8893409967422485, + "objective/scores": 0.0, + "policy/approxkl_avg": 19.67999267578125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0119833946228027 + }, + { + "episode": 3536, + "epoch": 0.021186085247630347, + "loss/policy_avg": 1.1937235593795776, + "lr": 9.859406952965236e-06, + "objective/entropy": -8.128410339355469, + "objective/kl": 29.450044631958008, + "objective/non_score_reward": -1.4725021123886108, + "objective/rlhf_reward": -4.065179969343255, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 190.29078674316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001643180847168 + }, + { + "episode": 3552, + "epoch": 0.021281949886759896, + "loss/policy_avg": 0.48739299178123474, + "lr": 9.858767893660533e-06, + "objective/entropy": 96.00523376464844, + "objective/kl": 25.66995620727539, + "objective/non_score_reward": -1.2834978103637695, + "objective/rlhf_reward": -3.8084781503974625, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 83.90371704101562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44921875, + "step": 221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984304904937744 + }, + { + "episode": 3568, + "epoch": 0.021377814525889445, + "loss/policy_avg": 0.4375818967819214, + "lr": 9.858128834355828e-06, + "objective/entropy": 75.25170135498047, + "objective/kl": 31.37661361694336, + "objective/non_score_reward": -1.5688308477401733, + "objective/rlhf_reward": -4.949810538321657, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 113.29835510253906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.515625, + "step": 222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974620342254639 + }, + { + "episode": 3584, + "epoch": 0.021473679165018994, + "loss/policy_avg": 0.18842488527297974, + "lr": 9.857489775051125e-06, + "objective/entropy": 62.053443908691406, + "objective/kl": 30.84737777709961, + "objective/non_score_reward": -1.5423686504364014, + "objective/rlhf_reward": -6.16947478055954, + "objective/scores": 0.0, + "policy/approxkl_avg": 170.18569946289062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58984375, + "step": 223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998410701751709 + }, + { + "episode": 3600, + "epoch": 0.021569543804148543, + "loss/policy_avg": 0.3384511470794678, + "lr": 9.856850715746422e-06, + "objective/entropy": 80.18526458740234, + "objective/kl": 23.6530818939209, + "objective/non_score_reward": -1.1826542615890503, + "objective/rlhf_reward": -4.730616986751556, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6079671382904053, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994169473648071 + }, + { + "episode": 3616, + "epoch": 0.021665408443278092, + "loss/policy_avg": 0.29894721508026123, + "lr": 9.856211656441719e-06, + "objective/entropy": 165.5200958251953, + "objective/kl": 18.515085220336914, + "objective/non_score_reward": -0.9257543087005615, + "objective/rlhf_reward": -3.703017294406891, + "objective/scores": 0.0, + "policy/approxkl_avg": 74.32862854003906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.70703125, + "step": 225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962903261184692 + }, + { + "episode": 3632, + "epoch": 0.02176127308240764, + "loss/policy_avg": 0.46412864327430725, + "lr": 9.855572597137016e-06, + "objective/entropy": 89.5113525390625, + "objective/kl": 29.43311882019043, + "objective/non_score_reward": -1.4716558456420898, + "objective/rlhf_reward": -7.886623382568359, + "objective/scores": -0.5, + "policy/approxkl_avg": 100.8701171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992918968200684 + }, + { + "episode": 3648, + "epoch": 0.02185713772153719, + "loss/policy_avg": -0.17664140462875366, + "lr": 9.854933537832313e-06, + "objective/entropy": -5.129295349121094, + "objective/kl": 16.053768157958984, + "objective/non_score_reward": -0.802688479423523, + "objective/rlhf_reward": -5.210753917694092, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.066779136657715, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.525390625, + "step": 227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001509189605713 + }, + { + "episode": 3664, + "epoch": 0.02195300236066674, + "loss/policy_avg": 0.23512879014015198, + "lr": 9.854294478527608e-06, + "objective/entropy": 236.32894897460938, + "objective/kl": 31.84699058532715, + "objective/non_score_reward": -1.5923495292663574, + "objective/rlhf_reward": -8.36939811706543, + "objective/scores": -0.5, + "policy/approxkl_avg": 100.35233306884766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73046875, + "step": 228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997907876968384 + }, + { + "episode": 3680, + "epoch": 0.022048866999796288, + "loss/policy_avg": -0.021687505766749382, + "lr": 9.853655419222905e-06, + "objective/entropy": 156.21514892578125, + "objective/kl": 33.45891189575195, + "objective/non_score_reward": -1.6729457378387451, + "objective/rlhf_reward": -6.691782712936401, + "objective/scores": 0.0, + "policy/approxkl_avg": 59.40328598022461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69140625, + "step": 229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973727464675903 + }, + { + "episode": 3696, + "epoch": 0.022144731638925837, + "loss/policy_avg": 0.2921329736709595, + "lr": 9.853016359918202e-06, + "objective/entropy": 283.9156494140625, + "objective/kl": 28.28559112548828, + "objective/non_score_reward": -1.4142796993255615, + "objective/rlhf_reward": -5.657118558883667, + "objective/scores": 0.0, + "policy/approxkl_avg": 67.94718933105469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.876953125, + "step": 230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9960227012634277 + }, + { + "episode": 3712, + "epoch": 0.022240596278055386, + "loss/policy_avg": 0.05423973500728607, + "lr": 9.852377300613498e-06, + "objective/entropy": 138.2334442138672, + "objective/kl": 43.57646560668945, + "objective/non_score_reward": -2.1788229942321777, + "objective/rlhf_reward": -7.159033386912897, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 98.089111328125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6875, + "step": 231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993014335632324 + }, + { + "episode": 3728, + "epoch": 0.022336460917184935, + "loss/policy_avg": 0.19056108593940735, + "lr": 9.851738241308795e-06, + "objective/entropy": 10.413976669311523, + "objective/kl": 18.887348175048828, + "objective/non_score_reward": -0.9443674087524414, + "objective/rlhf_reward": -3.777469515800476, + "objective/scores": 0.0, + "policy/approxkl_avg": 105.73004150390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59765625, + "step": 232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962282180786133 + }, + { + "episode": 3744, + "epoch": 0.022432325556314484, + "loss/policy_avg": 0.4601524770259857, + "lr": 9.85109918200409e-06, + "objective/entropy": -74.81282043457031, + "objective/kl": 24.299213409423828, + "objective/non_score_reward": -1.2149605751037598, + "objective/rlhf_reward": -4.859842598438263, + "objective/scores": 0.0, + "policy/approxkl_avg": 34.33855056762695, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.638671875, + "step": 233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0012383460998535 + }, + { + "episode": 3760, + "epoch": 0.022528190195444033, + "loss/policy_avg": 0.40300655364990234, + "lr": 9.850460122699387e-06, + "objective/entropy": -45.57096481323242, + "objective/kl": 30.75171661376953, + "objective/non_score_reward": -1.537585973739624, + "objective/rlhf_reward": -8.15034294128418, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.83643341064453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.791015625, + "step": 234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980592727661133 + }, + { + "episode": 3776, + "epoch": 0.02262405483457358, + "loss/policy_avg": 0.08251257985830307, + "lr": 9.849821063394683e-06, + "objective/entropy": -39.57012176513672, + "objective/kl": 21.46126937866211, + "objective/non_score_reward": -1.0730634927749634, + "objective/rlhf_reward": -2.9667412376701066, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 8.371784210205078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.591796875, + "step": 235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99739670753479 + }, + { + "episode": 3792, + "epoch": 0.02271991947370313, + "loss/policy_avg": 0.287686824798584, + "lr": 9.84918200408998e-06, + "objective/entropy": 225.470703125, + "objective/kl": 32.462642669677734, + "objective/non_score_reward": -1.6231322288513184, + "objective/rlhf_reward": -5.092528736591339, + "objective/scores": 0.35, + "policy/approxkl_avg": 233.72256469726562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972712993621826 + }, + { + "episode": 3808, + "epoch": 0.02281578411283268, + "loss/policy_avg": 0.40615230798721313, + "lr": 9.848542944785276e-06, + "objective/entropy": 138.72113037109375, + "objective/kl": 36.74411392211914, + "objective/non_score_reward": -1.8372057676315308, + "objective/rlhf_reward": -7.348823070526123, + "objective/scores": 0.0, + "policy/approxkl_avg": 68.71369934082031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51171875, + "step": 237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0015788078308105 + }, + { + "episode": 3824, + "epoch": 0.02291164875196223, + "loss/policy_avg": 0.2618522644042969, + "lr": 9.847903885480573e-06, + "objective/entropy": 218.24368286132812, + "objective/kl": 28.746074676513672, + "objective/non_score_reward": -1.4373037815093994, + "objective/rlhf_reward": -4.145094964567738, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 35.09134292602539, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9968454837799072 + }, + { + "episode": 3840, + "epoch": 0.023007513391091777, + "loss/policy_avg": 0.44723182916641235, + "lr": 9.84726482617587e-06, + "objective/entropy": 134.8599853515625, + "objective/kl": 30.93494415283203, + "objective/non_score_reward": -1.5467472076416016, + "objective/rlhf_reward": -4.630729882922724, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 72.10969543457031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997701644897461 + }, + { + "episode": 3856, + "epoch": 0.023103378030221326, + "loss/policy_avg": 1.2493870258331299, + "lr": 9.846625766871167e-06, + "objective/entropy": 101.4572525024414, + "objective/kl": 29.70997428894043, + "objective/non_score_reward": -1.4854986667633057, + "objective/rlhf_reward": -5.941995084285736, + "objective/scores": 0.0, + "policy/approxkl_avg": 100.62832641601562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.517578125, + "step": 240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961397647857666 + }, + { + "episode": 3872, + "epoch": 0.023199242669350875, + "loss/policy_avg": 0.11165004968643188, + "lr": 9.845986707566462e-06, + "objective/entropy": -147.07489013671875, + "objective/kl": 25.88149642944336, + "objective/non_score_reward": -1.2940750122070312, + "objective/rlhf_reward": -7.176300048828125, + "objective/scores": -0.5, + "policy/approxkl_avg": 239.4694061279297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998302698135376 + }, + { + "episode": 3888, + "epoch": 0.023295107308480424, + "loss/policy_avg": 0.24919648468494415, + "lr": 9.845347648261759e-06, + "objective/entropy": -68.71859741210938, + "objective/kl": 27.074668884277344, + "objective/non_score_reward": -1.3537335395812988, + "objective/rlhf_reward": -7.414934158325195, + "objective/scores": -0.5, + "policy/approxkl_avg": 81.3831787109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.443359375, + "step": 242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985148906707764 + }, + { + "episode": 3904, + "epoch": 0.023390971947609973, + "loss/policy_avg": 0.26694488525390625, + "lr": 9.844708588957056e-06, + "objective/entropy": 94.072265625, + "objective/kl": 28.564594268798828, + "objective/non_score_reward": -1.428229808807373, + "objective/rlhf_reward": -7.712919235229492, + "objective/scores": -0.5, + "policy/approxkl_avg": 219.86279296875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8515625, + "step": 243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972708225250244 + }, + { + "episode": 3920, + "epoch": 0.023486836586739522, + "loss/policy_avg": 0.6054710149765015, + "lr": 9.844069529652353e-06, + "objective/entropy": 129.7139892578125, + "objective/kl": 30.57644271850586, + "objective/non_score_reward": -1.5288220643997192, + "objective/rlhf_reward": -8.115287780761719, + "objective/scores": -0.5, + "policy/approxkl_avg": 83.28874206542969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.712890625, + "step": 244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9941790103912354 + }, + { + "episode": 3936, + "epoch": 0.023582701225869074, + "loss/policy_avg": 0.12586408853530884, + "lr": 9.84343047034765e-06, + "objective/entropy": 241.99429321289062, + "objective/kl": 27.60189437866211, + "objective/non_score_reward": -1.3800947666168213, + "objective/rlhf_reward": -5.520378828048706, + "objective/scores": 0.0, + "policy/approxkl_avg": 73.93728637695312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.83984375, + "step": 245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990196228027344 + }, + { + "episode": 3952, + "epoch": 0.023678565864998623, + "loss/policy_avg": 0.4687037467956543, + "lr": 9.842791411042945e-06, + "objective/entropy": 90.39759826660156, + "objective/kl": 26.80643081665039, + "objective/non_score_reward": -1.3403215408325195, + "objective/rlhf_reward": -7.361286163330078, + "objective/scores": -0.5, + "policy/approxkl_avg": 122.53802490234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9375, + "step": 246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.99778151512146 + }, + { + "episode": 3968, + "epoch": 0.023774430504128172, + "loss/policy_avg": 0.7288471460342407, + "lr": 9.842152351738242e-06, + "objective/entropy": 111.89739227294922, + "objective/kl": 26.556848526000977, + "objective/non_score_reward": -1.3278424739837646, + "objective/rlhf_reward": -5.311370104551315, + "objective/scores": 0.0, + "policy/approxkl_avg": 153.5714874267578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0046982765197754 + }, + { + "episode": 3984, + "epoch": 0.02387029514325772, + "loss/policy_avg": 0.4886789321899414, + "lr": 9.841513292433539e-06, + "objective/entropy": 143.58645629882812, + "objective/kl": 23.396575927734375, + "objective/non_score_reward": -1.1698288917541504, + "objective/rlhf_reward": -2.854486520561289, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 99.15058898925781, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.45703125, + "step": 248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984853267669678 + }, + { + "episode": 4000, + "epoch": 0.02396615978238727, + "loss/policy_avg": 0.20749720931053162, + "lr": 9.840874233128836e-06, + "objective/entropy": 34.73357391357422, + "objective/kl": 20.36009979248047, + "objective/non_score_reward": -1.0180050134658813, + "objective/rlhf_reward": -4.07202011346817, + "objective/scores": 0.0, + "policy/approxkl_avg": 159.967529296875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.580078125, + "step": 249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0006356239318848 + }, + { + "episode": 4016, + "epoch": 0.02406202442151682, + "loss/policy_avg": 0.3030295670032501, + "lr": 9.840235173824132e-06, + "objective/entropy": -112.51934814453125, + "objective/kl": 33.881568908691406, + "objective/non_score_reward": -1.6940785646438599, + "objective/rlhf_reward": -4.951485450538705, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 146.98245239257812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988691806793213 + }, + { + "episode": 4032, + "epoch": 0.024157889060646368, + "loss/policy_avg": 0.6671163439750671, + "lr": 9.83959611451943e-06, + "objective/entropy": -65.78260803222656, + "objective/kl": 24.848434448242188, + "objective/non_score_reward": -1.2424218654632568, + "objective/rlhf_reward": -6.969687461853027, + "objective/scores": -0.5, + "policy/approxkl_avg": 93.62220764160156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58203125, + "step": 251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9968496561050415 + }, + { + "episode": 4048, + "epoch": 0.024253753699775917, + "loss/policy_avg": 0.1528814733028412, + "lr": 9.838957055214724e-06, + "objective/entropy": 203.98094177246094, + "objective/kl": 40.56060791015625, + "objective/non_score_reward": -2.0280303955078125, + "objective/rlhf_reward": -10.11212158203125, + "objective/scores": -0.5, + "policy/approxkl_avg": 216.96200561523438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.935546875, + "step": 252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983346462249756 + }, + { + "episode": 4064, + "epoch": 0.024349618338905466, + "loss/policy_avg": 0.1612689346075058, + "lr": 9.838317995910021e-06, + "objective/entropy": 199.67910766601562, + "objective/kl": 17.4683780670166, + "objective/non_score_reward": -0.873418927192688, + "objective/rlhf_reward": 0.9063242912292484, + "objective/scores": 1.1, + "policy/approxkl_avg": 40.0599365234375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.685546875, + "step": 253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007472038269043 + }, + { + "episode": 4080, + "epoch": 0.024445482978035015, + "loss/policy_avg": 0.1594327837228775, + "lr": 9.837678936605318e-06, + "objective/entropy": 272.66253662109375, + "objective/kl": 22.29098129272461, + "objective/non_score_reward": -1.114549160003662, + "objective/rlhf_reward": -3.058196461200714, + "objective/scores": 0.35, + "policy/approxkl_avg": 11.639923095703125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8359375, + "step": 254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991519451141357 + }, + { + "episode": 4096, + "epoch": 0.024541347617164564, + "loss/policy_avg": 0.08180014789104462, + "lr": 9.837039877300615e-06, + "objective/entropy": 67.96817779541016, + "objective/kl": 19.31763458251953, + "objective/non_score_reward": -0.9658817052841187, + "objective/rlhf_reward": -5.863526821136475, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.514694213867188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.556640625, + "step": 255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998260736465454 + }, + { + "episode": 4112, + "epoch": 0.024637212256294113, + "loss/policy_avg": 0.05669542774558067, + "lr": 9.83640081799591e-06, + "objective/entropy": 74.68458557128906, + "objective/kl": 24.917343139648438, + "objective/non_score_reward": -1.2458672523498535, + "objective/rlhf_reward": -3.158640022548746, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 20.251989364624023, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60546875, + "step": 256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000245571136475 + }, + { + "episode": 4128, + "epoch": 0.024733076895423662, + "loss/policy_avg": -0.04459148645401001, + "lr": 9.835761758691207e-06, + "objective/entropy": 44.43208312988281, + "objective/kl": 26.832380294799805, + "objective/non_score_reward": -1.3416190147399902, + "objective/rlhf_reward": -3.9878734732545436, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 121.80049896240234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.61328125, + "step": 257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0049991607666016 + }, + { + "episode": 4144, + "epoch": 0.02482894153455321, + "loss/policy_avg": 0.2643028497695923, + "lr": 9.835122699386504e-06, + "objective/entropy": -58.66691207885742, + "objective/kl": 19.387344360351562, + "objective/non_score_reward": -0.969367265701294, + "objective/rlhf_reward": -5.877469062805176, + "objective/scores": -0.5, + "policy/approxkl_avg": 45.993797302246094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5859375, + "step": 258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9965884685516357 + }, + { + "episode": 4160, + "epoch": 0.02492480617368276, + "loss/policy_avg": 0.6535857915878296, + "lr": 9.8344836400818e-06, + "objective/entropy": 142.896484375, + "objective/kl": 22.385143280029297, + "objective/non_score_reward": -1.1192572116851807, + "objective/rlhf_reward": -6.477028846740723, + "objective/scores": -0.5, + "policy/approxkl_avg": 159.0670166015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.90625, + "step": 259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975087642669678 + }, + { + "episode": 4176, + "epoch": 0.02502067081281231, + "loss/policy_avg": 0.06705514341592789, + "lr": 9.833844580777096e-06, + "objective/entropy": 85.35755920410156, + "objective/kl": 42.735443115234375, + "objective/non_score_reward": -2.1367719173431396, + "objective/rlhf_reward": -6.8852281622296445, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 59.29423522949219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.513671875, + "step": 260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998096227645874 + }, + { + "episode": 4192, + "epoch": 0.025116535451941858, + "loss/policy_avg": 0.16437175869941711, + "lr": 9.833205521472393e-06, + "objective/entropy": 238.88308715820312, + "objective/kl": 35.51251983642578, + "objective/non_score_reward": -1.7756261825561523, + "objective/rlhf_reward": -9.10250473022461, + "objective/scores": -0.5, + "policy/approxkl_avg": 87.11653137207031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75390625, + "step": 261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962289333343506 + }, + { + "episode": 4208, + "epoch": 0.025212400091071407, + "loss/policy_avg": 0.2615561783313751, + "lr": 9.83256646216769e-06, + "objective/entropy": 165.4353790283203, + "objective/kl": 31.884780883789062, + "objective/non_score_reward": -1.5942389965057373, + "objective/rlhf_reward": -4.772836122576313, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 145.626708984375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689453125, + "step": 262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968907833099365 + }, + { + "episode": 4224, + "epoch": 0.025308264730200956, + "loss/policy_avg": 0.1498258411884308, + "lr": 9.831927402862987e-06, + "objective/entropy": 114.43228149414062, + "objective/kl": 34.159423828125, + "objective/non_score_reward": -1.7079713344573975, + "objective/rlhf_reward": -8.831884384155273, + "objective/scores": -0.5, + "policy/approxkl_avg": 120.08966827392578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6640625, + "step": 263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995566964149475 + }, + { + "episode": 4240, + "epoch": 0.025404129369330505, + "loss/policy_avg": 0.9565318822860718, + "lr": 9.831288343558284e-06, + "objective/entropy": 30.085983276367188, + "objective/kl": 30.190189361572266, + "objective/non_score_reward": -1.5095094442367554, + "objective/rlhf_reward": -6.038037717342377, + "objective/scores": 0.0, + "policy/approxkl_avg": 30.665681838989258, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.548828125, + "step": 264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984924793243408 + }, + { + "episode": 4256, + "epoch": 0.025499994008460054, + "loss/policy_avg": 0.1617015153169632, + "lr": 9.830649284253579e-06, + "objective/entropy": 85.59569549560547, + "objective/kl": 29.51198959350586, + "objective/non_score_reward": -1.4755992889404297, + "objective/rlhf_reward": -1.5023974239826199, + "objective/scores": 1.1, + "policy/approxkl_avg": 238.41380310058594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62109375, + "step": 265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998697280883789 + }, + { + "episode": 4272, + "epoch": 0.025595858647589603, + "loss/policy_avg": 0.12634433805942535, + "lr": 9.830010224948876e-06, + "objective/entropy": -17.845001220703125, + "objective/kl": 23.098552703857422, + "objective/non_score_reward": -1.1549276113510132, + "objective/rlhf_reward": -6.619710445404053, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.69245147705078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.564453125, + "step": 266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993417263031006 + }, + { + "episode": 4288, + "epoch": 0.02569172328671915, + "loss/policy_avg": 0.1535305678844452, + "lr": 9.829371165644173e-06, + "objective/entropy": 149.2012481689453, + "objective/kl": 30.30670928955078, + "objective/non_score_reward": -1.5153354406356812, + "objective/rlhf_reward": -4.545570218356785, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 55.40291976928711, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.517578125, + "step": 267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999010682106018 + }, + { + "episode": 4304, + "epoch": 0.0257875879258487, + "loss/policy_avg": 0.3865639567375183, + "lr": 9.82873210633947e-06, + "objective/entropy": 96.54017639160156, + "objective/kl": 32.002784729003906, + "objective/non_score_reward": -1.6001390218734741, + "objective/rlhf_reward": -8.400556564331055, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.64997100830078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5859375, + "step": 268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976623058319092 + }, + { + "episode": 4320, + "epoch": 0.02588345256497825, + "loss/policy_avg": -0.003022553399205208, + "lr": 9.828093047034766e-06, + "objective/entropy": 184.17962646484375, + "objective/kl": 34.98113250732422, + "objective/non_score_reward": -1.7490566968917847, + "objective/rlhf_reward": -5.480455153974232, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 177.02108764648438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0039427280426025 + }, + { + "episode": 4336, + "epoch": 0.0259793172041078, + "loss/policy_avg": 0.22940891981124878, + "lr": 9.827453987730061e-06, + "objective/entropy": 97.79884338378906, + "objective/kl": 28.352216720581055, + "objective/non_score_reward": -1.4176108837127686, + "objective/rlhf_reward": -5.670443296432495, + "objective/scores": 0.0, + "policy/approxkl_avg": 50.27137756347656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7734375, + "step": 270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995865821838379 + }, + { + "episode": 4352, + "epoch": 0.026075181843237347, + "loss/policy_avg": 0.5798227190971375, + "lr": 9.826814928425358e-06, + "objective/entropy": 114.09043884277344, + "objective/kl": 30.19118309020996, + "objective/non_score_reward": -1.509559154510498, + "objective/rlhf_reward": -6.038236498832703, + "objective/scores": 0.0, + "policy/approxkl_avg": 34.826072692871094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0023837089538574 + }, + { + "episode": 4368, + "epoch": 0.026171046482366896, + "loss/policy_avg": -0.021535426378250122, + "lr": 9.826175869120655e-06, + "objective/entropy": -37.564857482910156, + "objective/kl": 18.688800811767578, + "objective/non_score_reward": -0.9344400763511658, + "objective/rlhf_reward": -5.737760543823242, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.649078369140625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990510940551758 + }, + { + "episode": 4384, + "epoch": 0.02626691112149645, + "loss/policy_avg": 0.22535109519958496, + "lr": 9.825536809815952e-06, + "objective/entropy": 92.80372619628906, + "objective/kl": 36.460060119628906, + "objective/non_score_reward": -1.8230029344558716, + "objective/rlhf_reward": -5.9327618715509605, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 71.66783142089844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6875, + "step": 273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996490716934204 + }, + { + "episode": 4400, + "epoch": 0.026362775760625998, + "loss/policy_avg": 0.1513216346502304, + "lr": 9.824897750511249e-06, + "objective/entropy": 107.88948059082031, + "objective/kl": 27.020774841308594, + "objective/non_score_reward": -1.3510388135910034, + "objective/rlhf_reward": -5.404155135154724, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.633958339691162, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.353515625, + "step": 274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999776840209961 + }, + { + "episode": 4416, + "epoch": 0.026458640399755547, + "loss/policy_avg": 0.2180587649345398, + "lr": 9.824258691206546e-06, + "objective/entropy": 237.6265411376953, + "objective/kl": 34.5337028503418, + "objective/non_score_reward": -1.7266850471496582, + "objective/rlhf_reward": -5.350481002536371, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 64.67985534667969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7578125, + "step": 275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0010175704956055 + }, + { + "episode": 4432, + "epoch": 0.026554505038885096, + "loss/policy_avg": 0.1868075728416443, + "lr": 9.823619631901841e-06, + "objective/entropy": 153.34646606445312, + "objective/kl": 37.78309631347656, + "objective/non_score_reward": -1.8891546726226807, + "objective/rlhf_reward": -7.556619048118591, + "objective/scores": 0.0, + "policy/approxkl_avg": 104.59550476074219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.814453125, + "step": 276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994056224822998 + }, + { + "episode": 4448, + "epoch": 0.026650369678014645, + "loss/policy_avg": 0.9853407144546509, + "lr": 9.822980572597138e-06, + "objective/entropy": 241.00967407226562, + "objective/kl": 34.986572265625, + "objective/non_score_reward": -1.74932861328125, + "objective/rlhf_reward": -5.393194708887654, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 219.53729248046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966635704040527 + }, + { + "episode": 4464, + "epoch": 0.026746234317144194, + "loss/policy_avg": -0.05517375469207764, + "lr": 9.822341513292433e-06, + "objective/entropy": -5.935462951660156, + "objective/kl": 28.799835205078125, + "objective/non_score_reward": -1.4399919509887695, + "objective/rlhf_reward": -7.75996732711792, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.517549514770508, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.623046875, + "step": 278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0015547275543213 + }, + { + "episode": 4480, + "epoch": 0.026842098956273742, + "loss/policy_avg": 0.5763638019561768, + "lr": 9.82170245398773e-06, + "objective/entropy": 109.34716796875, + "objective/kl": 33.456336975097656, + "objective/non_score_reward": -1.6728168725967407, + "objective/rlhf_reward": -6.691267490386963, + "objective/scores": 0.0, + "policy/approxkl_avg": 87.05907440185547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.529296875, + "step": 279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9927139282226562 + }, + { + "episode": 4496, + "epoch": 0.02693796359540329, + "loss/policy_avg": 0.060494083911180496, + "lr": 9.821063394683027e-06, + "objective/entropy": 73.89436340332031, + "objective/kl": 30.13658905029297, + "objective/non_score_reward": -1.5068295001983643, + "objective/rlhf_reward": -8.027318000793457, + "objective/scores": -0.5, + "policy/approxkl_avg": 238.1710662841797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975123405456543 + }, + { + "episode": 4512, + "epoch": 0.02703382823453284, + "loss/policy_avg": 1.1403638124465942, + "lr": 9.820424335378324e-06, + "objective/entropy": 78.37907409667969, + "objective/kl": 35.09062194824219, + "objective/non_score_reward": -1.7545311450958252, + "objective/rlhf_reward": -7.018124580383301, + "objective/scores": 0.0, + "policy/approxkl_avg": 51.29010772705078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.544921875, + "step": 281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980716705322266 + }, + { + "episode": 4528, + "epoch": 0.02712969287366239, + "loss/policy_avg": 0.0838393121957779, + "lr": 9.81978527607362e-06, + "objective/entropy": 12.182060241699219, + "objective/kl": 30.722957611083984, + "objective/non_score_reward": -1.5361478328704834, + "objective/rlhf_reward": -4.197180102543767, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 42.573402404785156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.609375, + "step": 282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99773371219635 + }, + { + "episode": 4544, + "epoch": 0.02722555751279194, + "loss/policy_avg": 0.10264723747968674, + "lr": 9.819146216768916e-06, + "objective/entropy": -15.691246032714844, + "objective/kl": 31.322179794311523, + "objective/non_score_reward": -1.5661091804504395, + "objective/rlhf_reward": -8.264436721801758, + "objective/scores": -0.5, + "policy/approxkl_avg": 216.12973022460938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.77734375, + "step": 283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982627630233765 + }, + { + "episode": 4560, + "epoch": 0.027321422151921487, + "loss/policy_avg": -0.03270050510764122, + "lr": 9.818507157464213e-06, + "objective/entropy": 139.46694946289062, + "objective/kl": 25.990922927856445, + "objective/non_score_reward": -1.299546241760254, + "objective/rlhf_reward": -3.5940649843850903, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 66.3836441040039, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.62109375, + "step": 284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002379417419434 + }, + { + "episode": 4576, + "epoch": 0.027417286791051036, + "loss/policy_avg": -0.08705548942089081, + "lr": 9.81786809815951e-06, + "objective/entropy": 166.04830932617188, + "objective/kl": 24.707944869995117, + "objective/non_score_reward": -1.235397219657898, + "objective/rlhf_reward": -3.5415888786315914, + "objective/scores": 0.35, + "policy/approxkl_avg": 21.18993377685547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.630859375, + "step": 285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989569187164307 + }, + { + "episode": 4592, + "epoch": 0.027513151430180585, + "loss/policy_avg": 0.03873559087514877, + "lr": 9.817229038854806e-06, + "objective/entropy": -7.895801544189453, + "objective/kl": 26.817386627197266, + "objective/non_score_reward": -1.340869426727295, + "objective/rlhf_reward": -3.9634773492813107, + "objective/scores": 0.35, + "policy/approxkl_avg": 94.96324920654297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.689453125, + "step": 286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000836849212646 + }, + { + "episode": 4608, + "epoch": 0.027609016069310134, + "loss/policy_avg": 0.3947087824344635, + "lr": 9.816589979550103e-06, + "objective/entropy": 31.713714599609375, + "objective/kl": 35.37312698364258, + "objective/non_score_reward": -1.7686563730239868, + "objective/rlhf_reward": -7.074625730514526, + "objective/scores": 0.0, + "policy/approxkl_avg": 117.319091796875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55078125, + "step": 287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998248815536499 + }, + { + "episode": 4624, + "epoch": 0.027704880708439683, + "loss/policy_avg": -0.04924429580569267, + "lr": 9.8159509202454e-06, + "objective/entropy": 213.8193817138672, + "objective/kl": 33.728729248046875, + "objective/non_score_reward": -1.686436653137207, + "objective/rlhf_reward": -2.3457467317581173, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.397890090942383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8125, + "step": 288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001190662384033 + }, + { + "episode": 4640, + "epoch": 0.027800745347569232, + "loss/policy_avg": 0.3702055513858795, + "lr": 9.815311860940695e-06, + "objective/entropy": 60.47701644897461, + "objective/kl": 28.081138610839844, + "objective/non_score_reward": -1.4040570259094238, + "objective/rlhf_reward": -5.616227865219116, + "objective/scores": 0.0, + "policy/approxkl_avg": 141.1427001953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.52734375, + "step": 289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983237981796265 + }, + { + "episode": 4656, + "epoch": 0.02789660998669878, + "loss/policy_avg": 0.6483702659606934, + "lr": 9.814672801635992e-06, + "objective/entropy": 56.31958770751953, + "objective/kl": 32.672027587890625, + "objective/non_score_reward": -1.633601427078247, + "objective/rlhf_reward": -8.534405708312988, + "objective/scores": -0.5, + "policy/approxkl_avg": 82.03401184082031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7890625, + "step": 290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9977704286575317 + }, + { + "episode": 4672, + "epoch": 0.02799247462582833, + "loss/policy_avg": 0.2940463721752167, + "lr": 9.81403374233129e-06, + "objective/entropy": 116.48851013183594, + "objective/kl": 25.498628616333008, + "objective/non_score_reward": -1.2749314308166504, + "objective/rlhf_reward": -7.099725246429443, + "objective/scores": -0.5, + "policy/approxkl_avg": 66.54810333251953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80859375, + "step": 291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973530769348145 + }, + { + "episode": 4688, + "epoch": 0.02808833926495788, + "loss/policy_avg": 0.28353065252304077, + "lr": 9.813394683026586e-06, + "objective/entropy": -60.268192291259766, + "objective/kl": 31.477249145507812, + "objective/non_score_reward": -1.5738624334335327, + "objective/rlhf_reward": -6.295449614524841, + "objective/scores": 0.0, + "policy/approxkl_avg": 35.035682678222656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.888671875, + "step": 292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0013890266418457 + }, + { + "episode": 4704, + "epoch": 0.028184203904087428, + "loss/policy_avg": 0.35568001866340637, + "lr": 9.812755623721883e-06, + "objective/entropy": 126.98811340332031, + "objective/kl": 31.8903865814209, + "objective/non_score_reward": -1.5945194959640503, + "objective/rlhf_reward": -8.37807846069336, + "objective/scores": -0.5, + "policy/approxkl_avg": 254.9582977294922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4189453125, + "step": 293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979784488677979 + }, + { + "episode": 4720, + "epoch": 0.028280068543216977, + "loss/policy_avg": 0.2607693076133728, + "lr": 9.81211656441718e-06, + "objective/entropy": 25.59099578857422, + "objective/kl": 27.727397918701172, + "objective/non_score_reward": -1.3863699436187744, + "objective/rlhf_reward": -5.545479655265808, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.56276273727417, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985105991363525 + }, + { + "episode": 4736, + "epoch": 0.028375933182346526, + "loss/policy_avg": 0.09589973092079163, + "lr": 9.811477505112475e-06, + "objective/entropy": 60.00609588623047, + "objective/kl": 28.59209442138672, + "objective/non_score_reward": -1.4296045303344727, + "objective/rlhf_reward": -4.40773727556169, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 5.154585361480713, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.568359375, + "step": 295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0015292167663574 + }, + { + "episode": 4752, + "epoch": 0.028471797821476075, + "loss/policy_avg": -0.23297792673110962, + "lr": 9.810838445807772e-06, + "objective/entropy": 172.1509552001953, + "objective/kl": 28.202377319335938, + "objective/non_score_reward": -1.4101189374923706, + "objective/rlhf_reward": -5.640475809574127, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.175811767578125, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.619140625, + "step": 296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0159595012664795 + }, + { + "episode": 4768, + "epoch": 0.028567662460605624, + "loss/policy_avg": 0.24692611396312714, + "lr": 9.810199386503069e-06, + "objective/entropy": 82.19457244873047, + "objective/kl": 23.421173095703125, + "objective/non_score_reward": -1.1710586547851562, + "objective/rlhf_reward": -0.2842347383499142, + "objective/scores": 1.1, + "policy/approxkl_avg": 28.198326110839844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.765625, + "step": 297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000041961669922 + }, + { + "episode": 4784, + "epoch": 0.028663527099735173, + "loss/policy_avg": 0.2046826034784317, + "lr": 9.809560327198366e-06, + "objective/entropy": 98.7589111328125, + "objective/kl": 31.546274185180664, + "objective/non_score_reward": -1.5773136615753174, + "objective/rlhf_reward": -8.309255599975586, + "objective/scores": -0.5, + "policy/approxkl_avg": 203.54513549804688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.697265625, + "step": 298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9957282543182373 + }, + { + "episode": 4800, + "epoch": 0.02875939173886472, + "loss/policy_avg": 0.24512505531311035, + "lr": 9.808921267893663e-06, + "objective/entropy": 206.71981811523438, + "objective/kl": 26.81441307067871, + "objective/non_score_reward": -1.3407206535339355, + "objective/rlhf_reward": -5.362882316112518, + "objective/scores": 0.0, + "policy/approxkl_avg": 163.9492950439453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.796875, + "step": 299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996509313583374 + }, + { + "episode": 4816, + "epoch": 0.02885525637799427, + "loss/policy_avg": 0.028275877237319946, + "lr": 9.808282208588958e-06, + "objective/entropy": -18.974689483642578, + "objective/kl": 37.04328536987305, + "objective/non_score_reward": -1.852164387702942, + "objective/rlhf_reward": -9.40865707397461, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.29435348510742, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9966996908187866 + }, + { + "episode": 4832, + "epoch": 0.02895112101712382, + "loss/policy_avg": 0.1949348747730255, + "lr": 9.807643149284255e-06, + "objective/entropy": 83.58306121826172, + "objective/kl": 37.85429000854492, + "objective/non_score_reward": -1.8927146196365356, + "objective/rlhf_reward": -9.570858001708984, + "objective/scores": -0.5, + "policy/approxkl_avg": 69.484619140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983818531036377 + }, + { + "episode": 4848, + "epoch": 0.029046985656253372, + "loss/policy_avg": -0.1876036524772644, + "lr": 9.80700408997955e-06, + "objective/entropy": -33.166542053222656, + "objective/kl": 29.514928817749023, + "objective/non_score_reward": -1.4757463932037354, + "objective/rlhf_reward": -5.902985334396362, + "objective/scores": 0.0, + "policy/approxkl_avg": 21.677946090698242, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.58984375, + "step": 302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998695731163025 + }, + { + "episode": 4864, + "epoch": 0.02914285029538292, + "loss/policy_avg": 0.022164881229400635, + "lr": 9.806365030674847e-06, + "objective/entropy": 94.1938247680664, + "objective/kl": 34.645530700683594, + "objective/non_score_reward": -1.7322763204574585, + "objective/rlhf_reward": -8.929105758666992, + "objective/scores": -0.5, + "policy/approxkl_avg": 211.6139373779297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.779296875, + "step": 303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997363805770874 + }, + { + "episode": 4880, + "epoch": 0.02923871493451247, + "loss/policy_avg": 0.7907856106758118, + "lr": 9.805725971370144e-06, + "objective/entropy": 63.40114212036133, + "objective/kl": 32.65860366821289, + "objective/non_score_reward": -1.6329302787780762, + "objective/rlhf_reward": -6.531721353530884, + "objective/scores": 0.0, + "policy/approxkl_avg": 12.615059852600098, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8515625, + "step": 304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0000805854797363 + }, + { + "episode": 4896, + "epoch": 0.02933457957364202, + "loss/policy_avg": 0.07148364931344986, + "lr": 9.80508691206544e-06, + "objective/entropy": 88.27102661132812, + "objective/kl": 39.98832702636719, + "objective/non_score_reward": -1.9994162321090698, + "objective/rlhf_reward": -5.073946271778318, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 129.564208984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.533203125, + "step": 305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998765468597412 + }, + { + "episode": 4912, + "epoch": 0.029430444212771568, + "loss/policy_avg": 0.17122961580753326, + "lr": 9.804447852760737e-06, + "objective/entropy": -49.38551330566406, + "objective/kl": 24.57117462158203, + "objective/non_score_reward": -1.2285586595535278, + "objective/rlhf_reward": -6.914234638214111, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.128786087036133, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.755859375, + "step": 306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9974303245544434 + }, + { + "episode": 4928, + "epoch": 0.029526308851901117, + "loss/policy_avg": 0.2127828598022461, + "lr": 9.803808793456034e-06, + "objective/entropy": 75.18860626220703, + "objective/kl": 26.683685302734375, + "objective/non_score_reward": -1.3341842889785767, + "objective/rlhf_reward": -5.336737275123596, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.522137641906738, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6875, + "step": 307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001485824584961 + }, + { + "episode": 4944, + "epoch": 0.029622173491030666, + "loss/policy_avg": -0.11516772210597992, + "lr": 9.80316973415133e-06, + "objective/entropy": -0.04103279113769531, + "objective/kl": 38.64908218383789, + "objective/non_score_reward": -1.932453989982605, + "objective/rlhf_reward": -6.067956691206085, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 4.6061835289001465, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.724609375, + "step": 308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000288248062134 + }, + { + "episode": 4960, + "epoch": 0.029718038130160215, + "loss/policy_avg": 0.2119406759738922, + "lr": 9.802530674846626e-06, + "objective/entropy": 145.13497924804688, + "objective/kl": 37.7593879699707, + "objective/non_score_reward": -1.8879692554473877, + "objective/rlhf_reward": -9.55187702178955, + "objective/scores": -0.5, + "policy/approxkl_avg": 189.22616577148438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.447265625, + "step": 309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981553554534912 + }, + { + "episode": 4976, + "epoch": 0.029813902769289764, + "loss/policy_avg": 0.3452683091163635, + "lr": 9.801891615541923e-06, + "objective/entropy": 311.71026611328125, + "objective/kl": 26.775503158569336, + "objective/non_score_reward": -1.3387750387191772, + "objective/rlhf_reward": -5.3551002740859985, + "objective/scores": 0.0, + "policy/approxkl_avg": 134.57427978515625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.978515625, + "step": 310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99644136428833 + }, + { + "episode": 4992, + "epoch": 0.029909767408419313, + "loss/policy_avg": -0.21734583377838135, + "lr": 9.80125255623722e-06, + "objective/entropy": -49.33317184448242, + "objective/kl": 24.155736923217773, + "objective/non_score_reward": -1.2077867984771729, + "objective/rlhf_reward": -4.8311474323272705, + "objective/scores": 0.0, + "policy/approxkl_avg": 31.801761627197266, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.59765625, + "step": 311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999210834503174 + }, + { + "episode": 5008, + "epoch": 0.03000563204754886, + "loss/policy_avg": -0.4189864993095398, + "lr": 9.800613496932517e-06, + "objective/entropy": 176.72503662109375, + "objective/kl": 36.288841247558594, + "objective/non_score_reward": -1.8144421577453613, + "objective/rlhf_reward": -5.833936293323603, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 94.41351318359375, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.552734375, + "step": 312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.006214141845703 + }, + { + "episode": 5024, + "epoch": 0.03010149668667841, + "loss/policy_avg": 0.5682752132415771, + "lr": 9.799974437627812e-06, + "objective/entropy": 71.7433090209961, + "objective/kl": 31.988750457763672, + "objective/non_score_reward": -1.5994374752044678, + "objective/rlhf_reward": -8.397749900817871, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.52442932128906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.662109375, + "step": 313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989948272705078 + }, + { + "episode": 5040, + "epoch": 0.03019736132580796, + "loss/policy_avg": 0.5441787242889404, + "lr": 9.799335378323109e-06, + "objective/entropy": 214.158447265625, + "objective/kl": 31.045618057250977, + "objective/non_score_reward": -1.5522809028625488, + "objective/rlhf_reward": -1.8091239094734188, + "objective/scores": 1.1, + "policy/approxkl_avg": 58.665191650390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.876953125, + "step": 314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988675117492676 + }, + { + "episode": 5056, + "epoch": 0.03029322596493751, + "loss/policy_avg": 0.5254380702972412, + "lr": 9.798696319018406e-06, + "objective/entropy": -19.816749572753906, + "objective/kl": 39.83454895019531, + "objective/non_score_reward": -1.991727590560913, + "objective/rlhf_reward": -5.043191228748533, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 79.64820861816406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6328125, + "step": 315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973756074905396 + }, + { + "episode": 5072, + "epoch": 0.030389090604067057, + "loss/policy_avg": 0.2726515531539917, + "lr": 9.798057259713703e-06, + "objective/entropy": -4.6614837646484375, + "objective/kl": 35.06428527832031, + "objective/non_score_reward": -1.7532143592834473, + "objective/rlhf_reward": -9.012857437133789, + "objective/scores": -0.5, + "policy/approxkl_avg": 54.6142578125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5234375, + "step": 316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981968402862549 + }, + { + "episode": 5088, + "epoch": 0.030484955243196606, + "loss/policy_avg": 0.26141488552093506, + "lr": 9.797418200409e-06, + "objective/entropy": 47.011985778808594, + "objective/kl": 45.61205291748047, + "objective/non_score_reward": -2.28060245513916, + "objective/rlhf_reward": -11.12240982055664, + "objective/scores": -0.5, + "policy/approxkl_avg": 180.7010498046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997007131576538 + }, + { + "episode": 5104, + "epoch": 0.030580819882326155, + "loss/policy_avg": 0.4122789800167084, + "lr": 9.796779141104296e-06, + "objective/entropy": 94.4898452758789, + "objective/kl": 28.151004791259766, + "objective/non_score_reward": -1.4075502157211304, + "objective/rlhf_reward": -7.6302008628845215, + "objective/scores": -0.5, + "policy/approxkl_avg": 103.5096664428711, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.615234375, + "step": 318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992344379425049 + }, + { + "episode": 5120, + "epoch": 0.030676684521455704, + "loss/policy_avg": 0.0029612816870212555, + "lr": 9.796140081799592e-06, + "objective/entropy": 198.75570678710938, + "objective/kl": 25.085525512695312, + "objective/non_score_reward": -1.2542762756347656, + "objective/rlhf_reward": -5.017104983329773, + "objective/scores": 0.0, + "policy/approxkl_avg": 120.64035034179688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.84375, + "step": 319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988009929656982 + }, + { + "episode": 5136, + "epoch": 0.030772549160585253, + "loss/policy_avg": 0.3600286841392517, + "lr": 9.795501022494888e-06, + "objective/entropy": 42.533729553222656, + "objective/kl": 28.457992553710938, + "objective/non_score_reward": -1.422899603843689, + "objective/rlhf_reward": -5.691598415374756, + "objective/scores": 0.0, + "policy/approxkl_avg": 73.802490234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5234375, + "step": 320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9940855503082275 + }, + { + "episode": 5152, + "epoch": 0.030868413799714802, + "loss/policy_avg": -0.13510574400424957, + "lr": 9.794861963190185e-06, + "objective/entropy": -5.396385192871094, + "objective/kl": 22.229045867919922, + "objective/non_score_reward": -1.111452341079712, + "objective/rlhf_reward": -2.9952114923909754, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 45.65439987182617, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999731183052063 + }, + { + "episode": 5168, + "epoch": 0.03096427843884435, + "loss/policy_avg": -0.15009954571723938, + "lr": 9.794222903885482e-06, + "objective/entropy": -7.345497131347656, + "objective/kl": 32.8961181640625, + "objective/non_score_reward": -1.644805908203125, + "objective/rlhf_reward": -8.5792236328125, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.027180671691895, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.525390625, + "step": 322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002540111541748 + }, + { + "episode": 5184, + "epoch": 0.0310601430779739, + "loss/policy_avg": -0.0007271356880664825, + "lr": 9.793583844580777e-06, + "objective/entropy": 163.2379608154297, + "objective/kl": 29.110525131225586, + "objective/non_score_reward": -1.455526351928711, + "objective/rlhf_reward": -5.822105169296265, + "objective/scores": 0.0, + "policy/approxkl_avg": 22.514135360717773, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4892578125, + "step": 323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.010396718978882 + }, + { + "episode": 5200, + "epoch": 0.03115600771710345, + "loss/policy_avg": 0.19244720041751862, + "lr": 9.792944785276074e-06, + "objective/entropy": -23.184280395507812, + "objective/kl": 28.509687423706055, + "objective/non_score_reward": -1.4254844188690186, + "objective/rlhf_reward": -1.3019374370574948, + "objective/scores": 1.1, + "policy/approxkl_avg": 88.21593475341797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.734375, + "step": 324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9985356330871582 + }, + { + "episode": 5216, + "epoch": 0.031251872356233, + "loss/policy_avg": 0.12977905571460724, + "lr": 9.792305725971371e-06, + "objective/entropy": 150.05477905273438, + "objective/kl": 31.353958129882812, + "objective/non_score_reward": -1.5676978826522827, + "objective/rlhf_reward": -6.270791292190552, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.6948184967041, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.474609375, + "step": 325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000549077987671 + }, + { + "episode": 5232, + "epoch": 0.03134773699536255, + "loss/policy_avg": 1.3926464319229126, + "lr": 9.791666666666666e-06, + "objective/entropy": 145.20407104492188, + "objective/kl": 43.132911682128906, + "objective/non_score_reward": -2.1566455364227295, + "objective/rlhf_reward": -10.626582145690918, + "objective/scores": -0.5, + "policy/approxkl_avg": 143.59188842773438, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.703125, + "step": 326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001438617706299 + }, + { + "episode": 5248, + "epoch": 0.0314436016344921, + "loss/policy_avg": -0.06288231909275055, + "lr": 9.791027607361963e-06, + "objective/entropy": -0.5038261413574219, + "objective/kl": 29.548839569091797, + "objective/non_score_reward": -1.4774420261383057, + "objective/rlhf_reward": -4.584255132704897, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 33.57347869873047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4208984375, + "step": 327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0011537075042725 + }, + { + "episode": 5264, + "epoch": 0.03153946627362165, + "loss/policy_avg": 0.04387858510017395, + "lr": 9.79038854805726e-06, + "objective/entropy": 33.670188903808594, + "objective/kl": 19.800825119018555, + "objective/non_score_reward": -0.9900413751602173, + "objective/rlhf_reward": -5.960165500640869, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.26099967956543, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9964679479599 + }, + { + "episode": 5280, + "epoch": 0.0316353309127512, + "loss/policy_avg": 0.08830268681049347, + "lr": 9.789749488752557e-06, + "objective/entropy": 77.01775360107422, + "objective/kl": 33.558563232421875, + "objective/non_score_reward": -1.6779282093048096, + "objective/rlhf_reward": -8.711712837219238, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.013957977294922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998348593711853 + }, + { + "episode": 5296, + "epoch": 0.031731195551880746, + "loss/policy_avg": 0.39634984731674194, + "lr": 9.789110429447854e-06, + "objective/entropy": -55.59328079223633, + "objective/kl": 27.393354415893555, + "objective/non_score_reward": -1.3696677684783936, + "objective/rlhf_reward": -5.47867077589035, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.402725219726562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.525390625, + "step": 330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996036291122437 + }, + { + "episode": 5312, + "epoch": 0.031827060191010295, + "loss/policy_avg": 0.28659260272979736, + "lr": 9.78847137014315e-06, + "objective/entropy": -92.15465545654297, + "objective/kl": 37.35984802246094, + "objective/non_score_reward": -1.8679924011230469, + "objective/rlhf_reward": -7.471969723701477, + "objective/scores": 0.0, + "policy/approxkl_avg": 128.93292236328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999650239944458 + }, + { + "episode": 5328, + "epoch": 0.031922924830139844, + "loss/policy_avg": 0.04176933690905571, + "lr": 9.787832310838446e-06, + "objective/entropy": 134.93870544433594, + "objective/kl": 36.91297149658203, + "objective/non_score_reward": -1.8456485271453857, + "objective/rlhf_reward": -9.38259506225586, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.31844711303711, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.806640625, + "step": 332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973440170288086 + }, + { + "episode": 5344, + "epoch": 0.03201878946926939, + "loss/policy_avg": 0.12372894585132599, + "lr": 9.787193251533743e-06, + "objective/entropy": 71.03854370117188, + "objective/kl": 31.68985366821289, + "objective/non_score_reward": -1.5844929218292236, + "objective/rlhf_reward": -8.337971687316895, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.733943939208984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.783203125, + "step": 333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001028060913086 + }, + { + "episode": 5360, + "epoch": 0.03211465410839894, + "loss/policy_avg": 0.2358803153038025, + "lr": 9.78655419222904e-06, + "objective/entropy": 115.29727172851562, + "objective/kl": 28.339174270629883, + "objective/non_score_reward": -1.4169588088989258, + "objective/rlhf_reward": -7.667835235595703, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.00404357910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80859375, + "step": 334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99520742893219 + }, + { + "episode": 5376, + "epoch": 0.03221051874752849, + "loss/policy_avg": 0.023874476552009583, + "lr": 9.785915132924337e-06, + "objective/entropy": -10.510528564453125, + "objective/kl": 35.899017333984375, + "objective/non_score_reward": -1.7949509620666504, + "objective/rlhf_reward": -9.179803848266602, + "objective/scores": -0.5, + "policy/approxkl_avg": 134.71612548828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.642578125, + "step": 335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987369775772095 + }, + { + "episode": 5392, + "epoch": 0.03230638338665804, + "loss/policy_avg": 0.2157980352640152, + "lr": 9.785276073619633e-06, + "objective/entropy": -117.77426147460938, + "objective/kl": 36.19778060913086, + "objective/non_score_reward": -1.8098891973495483, + "objective/rlhf_reward": -7.239556908607483, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.148270606994629, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66015625, + "step": 336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979586601257324 + }, + { + "episode": 5408, + "epoch": 0.03240224802578759, + "loss/policy_avg": 0.5062718987464905, + "lr": 9.784637014314929e-06, + "objective/entropy": 2.911235809326172, + "objective/kl": 29.895044326782227, + "objective/non_score_reward": -1.494752287864685, + "objective/rlhf_reward": -4.668328216275572, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 57.235164642333984, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58984375, + "step": 337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989079236984253 + }, + { + "episode": 5424, + "epoch": 0.03249811266491714, + "loss/policy_avg": 0.35139840841293335, + "lr": 9.783997955010226e-06, + "objective/entropy": 17.953224182128906, + "objective/kl": 31.853174209594727, + "objective/non_score_reward": -1.5926587581634521, + "objective/rlhf_reward": -4.766515169207173, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 21.75585174560547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.787109375, + "step": 338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9962561130523682 + }, + { + "episode": 5440, + "epoch": 0.03259397730404669, + "loss/policy_avg": 0.5215449333190918, + "lr": 9.783358895705522e-06, + "objective/entropy": -65.21415710449219, + "objective/kl": 41.145973205566406, + "objective/non_score_reward": -2.0572988986968994, + "objective/rlhf_reward": -10.229194641113281, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.21241760253906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.75, + "step": 339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977318048477173 + }, + { + "episode": 5456, + "epoch": 0.032689841943176236, + "loss/policy_avg": 0.28369078040122986, + "lr": 9.78271983640082e-06, + "objective/entropy": 94.98407745361328, + "objective/kl": 35.06120300292969, + "objective/non_score_reward": -1.7530601024627686, + "objective/rlhf_reward": -9.01224136352539, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.02906036376953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.587890625, + "step": 340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989793300628662 + }, + { + "episode": 5472, + "epoch": 0.032785706582305785, + "loss/policy_avg": 1.9119699001312256, + "lr": 9.782080777096116e-06, + "objective/entropy": 103.6617202758789, + "objective/kl": 43.41075134277344, + "objective/non_score_reward": -2.1705374717712402, + "objective/rlhf_reward": -8.68215036392212, + "objective/scores": 0.0, + "policy/approxkl_avg": 27.09801483154297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.548828125, + "step": 341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000924825668335 + }, + { + "episode": 5488, + "epoch": 0.032881571221435334, + "loss/policy_avg": 0.5120334625244141, + "lr": 9.781441717791413e-06, + "objective/entropy": 167.12762451171875, + "objective/kl": 35.880027770996094, + "objective/non_score_reward": -1.7940013408660889, + "objective/rlhf_reward": -9.176005363464355, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.14405822753906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.630859375, + "step": 342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0025668144226074 + }, + { + "episode": 5504, + "epoch": 0.03297743586056488, + "loss/policy_avg": 0.015120374038815498, + "lr": 9.780802658486708e-06, + "objective/entropy": 205.39503479003906, + "objective/kl": 41.50642395019531, + "objective/non_score_reward": -2.0753211975097656, + "objective/rlhf_reward": -8.301284790039062, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8751791715621948, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7890625, + "step": 343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0046072006225586 + }, + { + "episode": 5520, + "epoch": 0.03307330049969443, + "loss/policy_avg": 1.035445213317871, + "lr": 9.780163599182005e-06, + "objective/entropy": 91.37589263916016, + "objective/kl": 35.44560241699219, + "objective/non_score_reward": -1.7722800970077515, + "objective/rlhf_reward": -5.638522367091522, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 18.435420989990234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999558925628662 + }, + { + "episode": 5536, + "epoch": 0.03316916513882398, + "loss/policy_avg": 0.07220157980918884, + "lr": 9.7795245398773e-06, + "objective/entropy": 109.74974822998047, + "objective/kl": 30.194711685180664, + "objective/non_score_reward": -1.5097355842590332, + "objective/rlhf_reward": -4.660340347377163, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 8.997635841369629, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.498046875, + "step": 345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9951503276824951 + }, + { + "episode": 5552, + "epoch": 0.03326502977795353, + "loss/policy_avg": -0.0515998937189579, + "lr": 9.778885480572597e-06, + "objective/entropy": 139.16102600097656, + "objective/kl": 23.493303298950195, + "objective/non_score_reward": -1.1746652126312256, + "objective/rlhf_reward": -4.698660850524902, + "objective/scores": 0.0, + "policy/approxkl_avg": 26.937942504882812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.654296875, + "step": 346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000459909439087 + }, + { + "episode": 5568, + "epoch": 0.03336089441708308, + "loss/policy_avg": 0.5854986906051636, + "lr": 9.778246421267894e-06, + "objective/entropy": -27.624618530273438, + "objective/kl": 25.353118896484375, + "objective/non_score_reward": -1.2676559686660767, + "objective/rlhf_reward": -3.466503713194447, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 65.4859619140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9998695850372314 + }, + { + "episode": 5584, + "epoch": 0.03345675905621263, + "loss/policy_avg": 0.13264235854148865, + "lr": 9.777607361963191e-06, + "objective/entropy": 180.84451293945312, + "objective/kl": 29.817018508911133, + "objective/non_score_reward": -1.4908509254455566, + "objective/rlhf_reward": -4.60415395471899, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 229.28366088867188, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.74609375, + "step": 348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972574710845947 + }, + { + "episode": 5600, + "epoch": 0.033552623695342176, + "loss/policy_avg": 0.0762765109539032, + "lr": 9.776968302658488e-06, + "objective/entropy": 60.79357147216797, + "objective/kl": 42.37275314331055, + "objective/non_score_reward": -2.1186375617980957, + "objective/rlhf_reward": -10.474550247192383, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.9780604839324951, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.595703125, + "step": 349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007505416870117 + }, + { + "episode": 5616, + "epoch": 0.033648488334471725, + "loss/policy_avg": 0.056807953864336014, + "lr": 9.776329243353783e-06, + "objective/entropy": 52.78253173828125, + "objective/kl": 34.426055908203125, + "objective/non_score_reward": -1.7213029861450195, + "objective/rlhf_reward": -5.404259326870799, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 43.55577087402344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.578125, + "step": 350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9966363906860352 + }, + { + "episode": 5632, + "epoch": 0.033744352973601274, + "loss/policy_avg": -0.12831875681877136, + "lr": 9.77569018404908e-06, + "objective/entropy": 16.329566955566406, + "objective/kl": 32.15143585205078, + "objective/non_score_reward": -1.6075717210769653, + "objective/rlhf_reward": -8.430286407470703, + "objective/scores": -0.5, + "policy/approxkl_avg": 75.677978515625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.50390625, + "step": 351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998267412185669 + }, + { + "episode": 5648, + "epoch": 0.03384021761273082, + "loss/policy_avg": 1.4532546997070312, + "lr": 9.775051124744377e-06, + "objective/entropy": 74.94012451171875, + "objective/kl": 27.538991928100586, + "objective/non_score_reward": -1.3769495487213135, + "objective/rlhf_reward": -7.507798194885254, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.15715026855469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.75, + "step": 352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0019326210021973 + }, + { + "episode": 5664, + "epoch": 0.03393608225186037, + "loss/policy_avg": 0.18699043989181519, + "lr": 9.774412065439674e-06, + "objective/entropy": 154.23028564453125, + "objective/kl": 26.423828125, + "objective/non_score_reward": -1.3211913108825684, + "objective/rlhf_reward": -7.284765243530273, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.985931396484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4609375, + "step": 353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997018575668335 + }, + { + "episode": 5680, + "epoch": 0.03403194689098992, + "loss/policy_avg": 0.4292873740196228, + "lr": 9.77377300613497e-06, + "objective/entropy": 69.04763793945312, + "objective/kl": 29.13799285888672, + "objective/non_score_reward": -1.456899642944336, + "objective/rlhf_reward": -7.8275980949401855, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.5076904296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984169006347656 + }, + { + "episode": 5696, + "epoch": 0.03412781153011947, + "loss/policy_avg": 0.3388688564300537, + "lr": 9.773133946830267e-06, + "objective/entropy": 138.8565673828125, + "objective/kl": 36.0447998046875, + "objective/non_score_reward": -1.802240014076233, + "objective/rlhf_reward": -7.208960175514221, + "objective/scores": 0.0, + "policy/approxkl_avg": 74.22533416748047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.79296875, + "step": 355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992218017578125 + }, + { + "episode": 5712, + "epoch": 0.03422367616924902, + "loss/policy_avg": 0.12512633204460144, + "lr": 9.772494887525563e-06, + "objective/entropy": 74.25532531738281, + "objective/kl": 22.747737884521484, + "objective/non_score_reward": -1.1373867988586426, + "objective/rlhf_reward": -6.54954719543457, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.5538787841796875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998750925064087 + }, + { + "episode": 5728, + "epoch": 0.03431954080837857, + "loss/policy_avg": 0.19646649062633514, + "lr": 9.77185582822086e-06, + "objective/entropy": 91.78384399414062, + "objective/kl": 33.680171966552734, + "objective/non_score_reward": -1.6840085983276367, + "objective/rlhf_reward": -5.357431986419064, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 234.2763214111328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.521484375, + "step": 357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99714994430542 + }, + { + "episode": 5744, + "epoch": 0.03441540544750812, + "loss/policy_avg": 0.1719483882188797, + "lr": 9.771216768916156e-06, + "objective/entropy": 15.523513793945312, + "objective/kl": 30.582561492919922, + "objective/non_score_reward": -1.529128074645996, + "objective/rlhf_reward": -6.116512298583984, + "objective/scores": 0.0, + "policy/approxkl_avg": 51.08443832397461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.408203125, + "step": 358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0013043880462646 + }, + { + "episode": 5760, + "epoch": 0.034511270086637666, + "loss/policy_avg": 0.24180445075035095, + "lr": 9.770577709611453e-06, + "objective/entropy": 118.51576232910156, + "objective/kl": 42.176387786865234, + "objective/non_score_reward": -2.1088194847106934, + "objective/rlhf_reward": -8.435277700424194, + "objective/scores": 0.0, + "policy/approxkl_avg": 118.47047424316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.583984375, + "step": 359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.995426893234253 + }, + { + "episode": 5776, + "epoch": 0.034607134725767215, + "loss/policy_avg": 0.43327879905700684, + "lr": 9.76993865030675e-06, + "objective/entropy": 136.4874267578125, + "objective/kl": 45.31035232543945, + "objective/non_score_reward": -2.2655177116394043, + "objective/rlhf_reward": -9.062070846557617, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.986034393310547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.396484375, + "step": 360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967424869537354 + }, + { + "episode": 5792, + "epoch": 0.034702999364896764, + "loss/policy_avg": 0.19567860662937164, + "lr": 9.769299591002045e-06, + "objective/entropy": 42.139869689941406, + "objective/kl": 34.16746520996094, + "objective/non_score_reward": -1.7083733081817627, + "objective/rlhf_reward": -6.83349347114563, + "objective/scores": 0.0, + "policy/approxkl_avg": 63.83492660522461, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6875, + "step": 361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9962726831436157 + }, + { + "episode": 5808, + "epoch": 0.03479886400402631, + "loss/policy_avg": 0.22219505906105042, + "lr": 9.768660531697342e-06, + "objective/entropy": 58.672523498535156, + "objective/kl": 36.7076416015625, + "objective/non_score_reward": -1.8353819847106934, + "objective/rlhf_reward": -5.825756037028965, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 122.09330749511719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4423828125, + "step": 362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996779441833496 + }, + { + "episode": 5824, + "epoch": 0.03489472864315586, + "loss/policy_avg": 0.15133829414844513, + "lr": 9.768021472392639e-06, + "objective/entropy": 178.90162658691406, + "objective/kl": 45.38115692138672, + "objective/non_score_reward": -2.2690577507019043, + "objective/rlhf_reward": -4.676230645179748, + "objective/scores": 1.1, + "policy/approxkl_avg": 104.61646270751953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.529296875, + "step": 363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996953010559082 + }, + { + "episode": 5840, + "epoch": 0.03499059328228541, + "loss/policy_avg": -0.021383460611104965, + "lr": 9.767382413087936e-06, + "objective/entropy": -69.16898345947266, + "objective/kl": 22.05933380126953, + "objective/non_score_reward": -1.1029666662216187, + "objective/rlhf_reward": -2.289160372988258, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 29.110021591186523, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00262188911438 + }, + { + "episode": 5856, + "epoch": 0.03508645792141496, + "loss/policy_avg": -0.04033146798610687, + "lr": 9.766743353783233e-06, + "objective/entropy": 141.72869873046875, + "objective/kl": 44.84279251098633, + "objective/non_score_reward": -2.2421395778656006, + "objective/rlhf_reward": -10.968558311462402, + "objective/scores": -0.5, + "policy/approxkl_avg": 186.49794006347656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.74609375, + "step": 365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999026894569397 + }, + { + "episode": 5872, + "epoch": 0.03518232256054451, + "loss/policy_avg": 0.4079732298851013, + "lr": 9.76610429447853e-06, + "objective/entropy": 111.8157958984375, + "objective/kl": 36.629844665527344, + "objective/non_score_reward": -1.8314921855926514, + "objective/rlhf_reward": -9.325968742370605, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.762906551361084, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.751953125, + "step": 366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0008926391601562 + }, + { + "episode": 5888, + "epoch": 0.03527818719967406, + "loss/policy_avg": 0.04947970062494278, + "lr": 9.765465235173825e-06, + "objective/entropy": 191.2625274658203, + "objective/kl": 34.450801849365234, + "objective/non_score_reward": -1.722540259361267, + "objective/rlhf_reward": -6.890160799026489, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.018008232116699, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.537109375, + "step": 367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001461982727051 + }, + { + "episode": 5904, + "epoch": 0.03537405183880361, + "loss/policy_avg": 0.0687071904540062, + "lr": 9.764826175869122e-06, + "objective/entropy": 83.29997253417969, + "objective/kl": 22.66602325439453, + "objective/non_score_reward": -1.1333011388778687, + "objective/rlhf_reward": -4.533204555511475, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5386806726455688, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.353515625, + "step": 368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999902248382568 + }, + { + "episode": 5920, + "epoch": 0.035469916477933155, + "loss/policy_avg": 0.2985497713088989, + "lr": 9.764187116564417e-06, + "objective/entropy": 139.81521606445312, + "objective/kl": 24.542152404785156, + "objective/non_score_reward": -1.2271076440811157, + "objective/rlhf_reward": -4.908430516719818, + "objective/scores": 0.0, + "policy/approxkl_avg": 34.6478271484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41796875, + "step": 369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0018105506896973 + }, + { + "episode": 5936, + "epoch": 0.035565781117062704, + "loss/policy_avg": 0.08180458098649979, + "lr": 9.763548057259714e-06, + "objective/entropy": -5.604040145874023, + "objective/kl": 37.0791015625, + "objective/non_score_reward": -1.8539552688598633, + "objective/rlhf_reward": -5.859561531749323, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.4335670471191406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.580078125, + "step": 370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005955696105957 + }, + { + "episode": 5952, + "epoch": 0.03566164575619225, + "loss/policy_avg": 0.506417453289032, + "lr": 9.76290899795501e-06, + "objective/entropy": 35.1672248840332, + "objective/kl": 32.360355377197266, + "objective/non_score_reward": -1.618017554283142, + "objective/rlhf_reward": -5.161389222344756, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 189.36500549316406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.615234375, + "step": 371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977048635482788 + }, + { + "episode": 5968, + "epoch": 0.0357575103953218, + "loss/policy_avg": 0.265750527381897, + "lr": 9.762269938650308e-06, + "objective/entropy": 194.55935668945312, + "objective/kl": 37.323184967041016, + "objective/non_score_reward": -1.866159200668335, + "objective/rlhf_reward": -3.064636981487274, + "objective/scores": 1.1, + "policy/approxkl_avg": 69.688232421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976245164871216 + }, + { + "episode": 5984, + "epoch": 0.03585337503445135, + "loss/policy_avg": 0.7971226572990417, + "lr": 9.761630879345604e-06, + "objective/entropy": 8.025199890136719, + "objective/kl": 27.826885223388672, + "objective/non_score_reward": -1.3913441896438599, + "objective/rlhf_reward": -5.5653769969940186, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.6349968910217285, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4296875, + "step": 373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0026164054870605 + }, + { + "episode": 6000, + "epoch": 0.03594923967358091, + "loss/policy_avg": 0.40386438369750977, + "lr": 9.7609918200409e-06, + "objective/entropy": -74.85362243652344, + "objective/kl": 34.3883056640625, + "objective/non_score_reward": -1.7194151878356934, + "objective/rlhf_reward": -6.8776609897613525, + "objective/scores": 0.0, + "policy/approxkl_avg": 34.913692474365234, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.646484375, + "step": 374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9994592666625977 + }, + { + "episode": 6016, + "epoch": 0.036045104312710456, + "loss/policy_avg": 0.13621872663497925, + "lr": 9.760352760736196e-06, + "objective/entropy": 66.13349914550781, + "objective/kl": 35.28841781616211, + "objective/non_score_reward": -1.764420986175537, + "objective/rlhf_reward": -5.324350611368815, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 29.487491607666016, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4521484375, + "step": 375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004875659942627 + }, + { + "episode": 6032, + "epoch": 0.036140968951840005, + "loss/policy_avg": 1.1029133796691895, + "lr": 9.759713701431493e-06, + "objective/entropy": 152.87005615234375, + "objective/kl": 33.37676239013672, + "objective/non_score_reward": -1.6688382625579834, + "objective/rlhf_reward": -4.9420197168986, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 139.82919311523438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80078125, + "step": 376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9985812902450562 + }, + { + "episode": 6048, + "epoch": 0.036236833590969554, + "loss/policy_avg": 0.11807486414909363, + "lr": 9.75907464212679e-06, + "objective/entropy": 217.25425720214844, + "objective/kl": 36.396339416503906, + "objective/non_score_reward": -1.819817066192627, + "objective/rlhf_reward": -9.279268264770508, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.34674835205078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962363243103027 + }, + { + "episode": 6064, + "epoch": 0.0363326982300991, + "loss/policy_avg": 0.007799604907631874, + "lr": 9.758435582822087e-06, + "objective/entropy": 101.9019546508789, + "objective/kl": 46.355655670166016, + "objective/non_score_reward": -2.3177828788757324, + "objective/rlhf_reward": -11.27113151550293, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.56651306152344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4853515625, + "step": 378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000476360321045 + }, + { + "episode": 6080, + "epoch": 0.03642856286922865, + "loss/policy_avg": -0.08341000974178314, + "lr": 9.757796523517384e-06, + "objective/entropy": 89.2676010131836, + "objective/kl": 46.89963912963867, + "objective/non_score_reward": -2.3449819087982178, + "objective/rlhf_reward": -9.37992775440216, + "objective/scores": 0.0, + "policy/approxkl_avg": 182.4227294921875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.583984375, + "step": 379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0021252632141113 + }, + { + "episode": 6096, + "epoch": 0.0365244275083582, + "loss/policy_avg": 0.34633713960647583, + "lr": 9.75715746421268e-06, + "objective/entropy": 111.65666198730469, + "objective/kl": 32.17061233520508, + "objective/non_score_reward": -1.6085307598114014, + "objective/rlhf_reward": -6.434122920036316, + "objective/scores": 0.0, + "policy/approxkl_avg": 88.91949462890625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.568359375, + "step": 380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998034954071045 + }, + { + "episode": 6112, + "epoch": 0.03662029214748775, + "loss/policy_avg": 0.4559730887413025, + "lr": 9.756518404907976e-06, + "objective/entropy": -2.1730079650878906, + "objective/kl": 24.72875213623047, + "objective/non_score_reward": -1.2364375591278076, + "objective/rlhf_reward": -4.945750296115875, + "objective/scores": 0.0, + "policy/approxkl_avg": 28.68434715270996, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6171875, + "step": 381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009069442749023 + }, + { + "episode": 6128, + "epoch": 0.0367161567866173, + "loss/policy_avg": 0.255852073431015, + "lr": 9.755879345603273e-06, + "objective/entropy": 189.11827087402344, + "objective/kl": 39.26380157470703, + "objective/non_score_reward": -1.9631900787353516, + "objective/rlhf_reward": -7.852760314941406, + "objective/scores": 0.0, + "policy/approxkl_avg": 43.19601821899414, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.603515625, + "step": 382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998410940170288 + }, + { + "episode": 6144, + "epoch": 0.03681202142574685, + "loss/policy_avg": 0.07754311710596085, + "lr": 9.75524028629857e-06, + "objective/entropy": 128.18150329589844, + "objective/kl": 30.46546173095703, + "objective/non_score_reward": -1.523273229598999, + "objective/rlhf_reward": -4.431233172834502, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 10.884984970092773, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.466796875, + "step": 383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0032522678375244 + }, + { + "episode": 6160, + "epoch": 0.0369078860648764, + "loss/policy_avg": 0.021326124668121338, + "lr": 9.754601226993867e-06, + "objective/entropy": 110.02630615234375, + "objective/kl": 46.91279602050781, + "objective/non_score_reward": -2.345640182495117, + "objective/rlhf_reward": -11.382560729980469, + "objective/scores": -0.5, + "policy/approxkl_avg": 92.21537780761719, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4365234375, + "step": 384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984705448150635 + }, + { + "episode": 6176, + "epoch": 0.037003750704005946, + "loss/policy_avg": -0.04232418164610863, + "lr": 9.753962167689162e-06, + "objective/entropy": 150.83932495117188, + "objective/kl": 36.70783996582031, + "objective/non_score_reward": -1.8353919982910156, + "objective/rlhf_reward": -7.341568350791931, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.093977928161621, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4365234375, + "step": 385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997746467590332 + }, + { + "episode": 6192, + "epoch": 0.037099615343135495, + "loss/policy_avg": -0.19166389107704163, + "lr": 9.753323108384459e-06, + "objective/entropy": 171.91030883789062, + "objective/kl": 34.13121032714844, + "objective/non_score_reward": -1.7065606117248535, + "objective/rlhf_reward": -8.826242446899414, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.414439678192139, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.537109375, + "step": 386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.011042356491089 + }, + { + "episode": 6208, + "epoch": 0.037195479982265044, + "loss/policy_avg": 0.1640317738056183, + "lr": 9.752684049079756e-06, + "objective/entropy": 48.400291442871094, + "objective/kl": 50.87938690185547, + "objective/non_score_reward": -2.54396915435791, + "objective/rlhf_reward": -8.351048584255288, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 37.014671325683594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5234375, + "step": 387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981615543365479 + }, + { + "episode": 6224, + "epoch": 0.03729134462139459, + "loss/policy_avg": 0.3416689932346344, + "lr": 9.752044989775053e-06, + "objective/entropy": 75.96147155761719, + "objective/kl": 41.44689178466797, + "objective/non_score_reward": -2.0723445415496826, + "objective/rlhf_reward": -10.28937816619873, + "objective/scores": -0.5, + "policy/approxkl_avg": 87.05730438232422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997666358947754 + }, + { + "episode": 6240, + "epoch": 0.03738720926052414, + "loss/policy_avg": 0.43460220098495483, + "lr": 9.751405930470348e-06, + "objective/entropy": 77.63334655761719, + "objective/kl": 35.82466125488281, + "objective/non_score_reward": -1.7912328243255615, + "objective/rlhf_reward": -5.805681669448299, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 26.701194763183594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3876953125, + "step": 389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001871109008789 + }, + { + "episode": 6256, + "epoch": 0.03748307389965369, + "loss/policy_avg": -0.16133618354797363, + "lr": 9.750766871165645e-06, + "objective/entropy": 118.75211334228516, + "objective/kl": 38.7314453125, + "objective/non_score_reward": -1.9365723133087158, + "objective/rlhf_reward": -9.746289253234863, + "objective/scores": -0.5, + "policy/approxkl_avg": 171.99203491210938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3916015625, + "step": 390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972796440124512 + }, + { + "episode": 6272, + "epoch": 0.03757893853878324, + "loss/policy_avg": 0.3922348618507385, + "lr": 9.750127811860941e-06, + "objective/entropy": 102.3033447265625, + "objective/kl": 32.2958869934082, + "objective/non_score_reward": -1.6147942543029785, + "objective/rlhf_reward": -8.459177017211914, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.286659240722656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.796875, + "step": 391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978159666061401 + }, + { + "episode": 6288, + "epoch": 0.03767480317791279, + "loss/policy_avg": 0.17723676562309265, + "lr": 9.749488752556238e-06, + "objective/entropy": 22.37232208251953, + "objective/kl": 27.597850799560547, + "objective/non_score_reward": -1.3798925876617432, + "objective/rlhf_reward": -5.519570171833038, + "objective/scores": 0.0, + "policy/approxkl_avg": 56.990116119384766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4443359375, + "step": 392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986002445220947 + }, + { + "episode": 6304, + "epoch": 0.03777066781704234, + "loss/policy_avg": 0.7708048820495605, + "lr": 9.748849693251534e-06, + "objective/entropy": 177.01199340820312, + "objective/kl": 42.79792404174805, + "objective/non_score_reward": -2.1398961544036865, + "objective/rlhf_reward": -4.159584856033325, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.181797504425049, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.693359375, + "step": 393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0018231868743896 + }, + { + "episode": 6320, + "epoch": 0.037866532456171886, + "loss/policy_avg": 0.34089791774749756, + "lr": 9.74821063394683e-06, + "objective/entropy": 140.12313842773438, + "objective/kl": 46.36503219604492, + "objective/non_score_reward": -2.318251609802246, + "objective/rlhf_reward": -7.894404628363949, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 204.4562225341797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.412109375, + "step": 394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007781982421875 + }, + { + "episode": 6336, + "epoch": 0.037962397095301435, + "loss/policy_avg": 0.16855208575725555, + "lr": 9.747571574642127e-06, + "objective/entropy": 142.73866271972656, + "objective/kl": 39.905517578125, + "objective/non_score_reward": -1.9952759742736816, + "objective/rlhf_reward": -9.981103897094727, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.86113739013672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4619140625, + "step": 395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989800453186035 + }, + { + "episode": 6352, + "epoch": 0.038058261734430984, + "loss/policy_avg": -0.01784345507621765, + "lr": 9.746932515337424e-06, + "objective/entropy": 206.17181396484375, + "objective/kl": 50.423553466796875, + "objective/non_score_reward": -2.5211775302886963, + "objective/rlhf_reward": -12.084710121154785, + "objective/scores": -0.5, + "policy/approxkl_avg": 78.37747192382812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.64453125, + "step": 396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0067272186279297 + }, + { + "episode": 6368, + "epoch": 0.03815412637356053, + "loss/policy_avg": -0.05958561599254608, + "lr": 9.746293456032721e-06, + "objective/entropy": 76.2546157836914, + "objective/kl": 21.187191009521484, + "objective/non_score_reward": -1.0593595504760742, + "objective/rlhf_reward": -2.5041048685709635, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 19.284114837646484, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.666015625, + "step": 397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0066661834716797 + }, + { + "episode": 6384, + "epoch": 0.03824999101269008, + "loss/policy_avg": -0.023946866393089294, + "lr": 9.745654396728016e-06, + "objective/entropy": 62.24019241333008, + "objective/kl": 30.506431579589844, + "objective/non_score_reward": -1.5253217220306396, + "objective/rlhf_reward": -8.101286888122559, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.33515930175781, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.609375, + "step": 398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000063180923462 + }, + { + "episode": 6400, + "epoch": 0.03834585565181963, + "loss/policy_avg": -0.22125929594039917, + "lr": 9.745015337423313e-06, + "objective/entropy": 106.82333374023438, + "objective/kl": 38.80064392089844, + "objective/non_score_reward": -1.9400321245193481, + "objective/rlhf_reward": -9.76012897491455, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.642780303955078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.662109375, + "step": 399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998152256011963 + }, + { + "episode": 6416, + "epoch": 0.03844172029094918, + "loss/policy_avg": 1.0083184242248535, + "lr": 9.74437627811861e-06, + "objective/entropy": -8.957130432128906, + "objective/kl": 45.45331573486328, + "objective/non_score_reward": -2.2726659774780273, + "objective/rlhf_reward": -9.09066355228424, + "objective/scores": 0.0, + "policy/approxkl_avg": 336.0928955078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.453125, + "step": 400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9965978860855103 + }, + { + "episode": 6432, + "epoch": 0.03853758493007873, + "loss/policy_avg": 0.06534097343683243, + "lr": 9.743737218813907e-06, + "objective/entropy": 43.69670486450195, + "objective/kl": 37.32368469238281, + "objective/non_score_reward": -1.866184115409851, + "objective/rlhf_reward": -9.464736938476562, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.217670440673828, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.578125, + "step": 401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000572443008423 + }, + { + "episode": 6448, + "epoch": 0.03863344956920828, + "loss/policy_avg": 0.21769657731056213, + "lr": 9.743098159509204e-06, + "objective/entropy": 117.73030090332031, + "objective/kl": 35.21237564086914, + "objective/non_score_reward": -1.7606186866760254, + "objective/rlhf_reward": -9.042474746704102, + "objective/scores": -0.5, + "policy/approxkl_avg": 106.69755554199219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5625, + "step": 402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000271797180176 + }, + { + "episode": 6464, + "epoch": 0.03872931420833783, + "loss/policy_avg": 0.2948653995990753, + "lr": 9.7424591002045e-06, + "objective/entropy": 122.77405548095703, + "objective/kl": 32.650047302246094, + "objective/non_score_reward": -1.6325020790100098, + "objective/rlhf_reward": -5.204495821028871, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 234.55484008789062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.37890625, + "step": 403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999098539352417 + }, + { + "episode": 6480, + "epoch": 0.038825178847467376, + "loss/policy_avg": 0.051352113485336304, + "lr": 9.741820040899796e-06, + "objective/entropy": -68.59423065185547, + "objective/kl": 38.657508850097656, + "objective/non_score_reward": -1.932875633239746, + "objective/rlhf_reward": -9.731502532958984, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.453775405883789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975395202636719 + }, + { + "episode": 6496, + "epoch": 0.038921043486596925, + "loss/policy_avg": 0.16383764147758484, + "lr": 9.741180981595093e-06, + "objective/entropy": 180.20687866210938, + "objective/kl": 46.885032653808594, + "objective/non_score_reward": -2.3442516326904297, + "objective/rlhf_reward": -11.377006530761719, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.65456008911133, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.890625, + "step": 405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9967621564865112 + }, + { + "episode": 6512, + "epoch": 0.039016908125726474, + "loss/policy_avg": 0.13604994118213654, + "lr": 9.74054192229039e-06, + "objective/entropy": 3.9914321899414062, + "objective/kl": 40.065860748291016, + "objective/non_score_reward": -2.003293037414551, + "objective/rlhf_reward": -10.013172149658203, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.099180221557617, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.505859375, + "step": 406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998462200164795 + }, + { + "episode": 6528, + "epoch": 0.03911277276485602, + "loss/policy_avg": 0.3104326128959656, + "lr": 9.739902862985686e-06, + "objective/entropy": 70.04134368896484, + "objective/kl": 36.113067626953125, + "objective/non_score_reward": -1.8056533336639404, + "objective/rlhf_reward": -5.880977442770629, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 103.19497680664062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.611328125, + "step": 407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986114501953125 + }, + { + "episode": 6544, + "epoch": 0.03920863740398557, + "loss/policy_avg": 8.922710418701172, + "lr": 9.739263803680983e-06, + "objective/entropy": 75.4478759765625, + "objective/kl": 50.82036209106445, + "objective/non_score_reward": -2.541018009185791, + "objective/rlhf_reward": -10.164072275161743, + "objective/scores": 0.0, + "policy/approxkl_avg": 78.79092407226562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.591796875, + "step": 408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998524188995361 + }, + { + "episode": 6560, + "epoch": 0.03930450204311512, + "loss/policy_avg": 0.2832748293876648, + "lr": 9.73862474437628e-06, + "objective/entropy": 109.94105529785156, + "objective/kl": 39.018890380859375, + "objective/non_score_reward": -1.9509445428848267, + "objective/rlhf_reward": -9.803777694702148, + "objective/scores": -0.5, + "policy/approxkl_avg": 85.17427062988281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3232421875, + "step": 409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999758005142212 + }, + { + "episode": 6576, + "epoch": 0.03940036668224467, + "loss/policy_avg": 1.1405491828918457, + "lr": 9.737985685071575e-06, + "objective/entropy": 178.7044677734375, + "objective/kl": 34.637672424316406, + "objective/non_score_reward": -1.7318837642669678, + "objective/rlhf_reward": -8.927535057067871, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.027103424072266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.650390625, + "step": 410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000344753265381 + }, + { + "episode": 6592, + "epoch": 0.03949623132137422, + "loss/policy_avg": -0.1532156616449356, + "lr": 9.737346625766872e-06, + "objective/entropy": 82.60413360595703, + "objective/kl": 30.524627685546875, + "objective/non_score_reward": -1.5262314081192017, + "objective/rlhf_reward": -6.104925632476807, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.448099136352539, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.494140625, + "step": 411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0018105506896973 + }, + { + "episode": 6608, + "epoch": 0.03959209596050377, + "loss/policy_avg": 0.17056474089622498, + "lr": 9.736707566462167e-06, + "objective/entropy": 157.149658203125, + "objective/kl": 33.411128997802734, + "objective/non_score_reward": -1.6705564260482788, + "objective/rlhf_reward": -6.682225704193115, + "objective/scores": 0.0, + "policy/approxkl_avg": 184.52642822265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.72265625, + "step": 412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977160692214966 + }, + { + "episode": 6624, + "epoch": 0.039687960599633317, + "loss/policy_avg": 0.25223565101623535, + "lr": 9.736068507157464e-06, + "objective/entropy": 48.83440017700195, + "objective/kl": 31.99204444885254, + "objective/non_score_reward": -1.599602222442627, + "objective/rlhf_reward": -8.398408889770508, + "objective/scores": -0.5, + "policy/approxkl_avg": 53.256534576416016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3837890625, + "step": 413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977399110794067 + }, + { + "episode": 6640, + "epoch": 0.039783825238762865, + "loss/policy_avg": 0.1266993284225464, + "lr": 9.735429447852761e-06, + "objective/entropy": 159.01785278320312, + "objective/kl": 31.140743255615234, + "objective/non_score_reward": -1.5570372343063354, + "objective/rlhf_reward": -6.228149056434631, + "objective/scores": 0.0, + "policy/approxkl_avg": 101.76364135742188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4931640625, + "step": 414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9946467876434326 + }, + { + "episode": 6656, + "epoch": 0.039879689877892414, + "loss/policy_avg": 0.22998680174350739, + "lr": 9.734790388548058e-06, + "objective/entropy": 46.99758529663086, + "objective/kl": 35.97903823852539, + "objective/non_score_reward": -1.7989518642425537, + "objective/rlhf_reward": -9.195807456970215, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.40435791015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0022130012512207 + }, + { + "episode": 6672, + "epoch": 0.03997555451702196, + "loss/policy_avg": 0.1837829202413559, + "lr": 9.734151329243355e-06, + "objective/entropy": 249.3802490234375, + "objective/kl": 43.223350524902344, + "objective/non_score_reward": -2.1611673831939697, + "objective/rlhf_reward": -8.644670009613037, + "objective/scores": 0.0, + "policy/approxkl_avg": 47.855064392089844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.78125, + "step": 416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992127418518066 + }, + { + "episode": 6688, + "epoch": 0.04007141915615151, + "loss/policy_avg": 0.035887204110622406, + "lr": 9.73351226993865e-06, + "objective/entropy": 136.988525390625, + "objective/kl": 28.082172393798828, + "objective/non_score_reward": -1.4041086435317993, + "objective/rlhf_reward": -5.616434454917908, + "objective/scores": 0.0, + "policy/approxkl_avg": 85.44422912597656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4560546875, + "step": 417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0017051696777344 + }, + { + "episode": 6704, + "epoch": 0.04016728379528106, + "loss/policy_avg": 0.5762748718261719, + "lr": 9.732873210633947e-06, + "objective/entropy": 253.0686798095703, + "objective/kl": 40.425331115722656, + "objective/non_score_reward": -2.021266460418701, + "objective/rlhf_reward": -10.085065841674805, + "objective/scores": -0.5, + "policy/approxkl_avg": 64.24309539794922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.783203125, + "step": 418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996999502182007 + }, + { + "episode": 6720, + "epoch": 0.04026314843441061, + "loss/policy_avg": 0.2653573751449585, + "lr": 9.732234151329244e-06, + "objective/entropy": 185.62173461914062, + "objective/kl": 48.3837890625, + "objective/non_score_reward": -2.419189453125, + "objective/rlhf_reward": -11.6767578125, + "objective/scores": -0.5, + "policy/approxkl_avg": 84.46121215820312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0015792846679688 + }, + { + "episode": 6736, + "epoch": 0.04035901307354016, + "loss/policy_avg": 0.45824360847473145, + "lr": 9.73159509202454e-06, + "objective/entropy": 189.05142211914062, + "objective/kl": 40.93785858154297, + "objective/non_score_reward": -2.0468931198120117, + "objective/rlhf_reward": -8.187572240829468, + "objective/scores": 0.0, + "policy/approxkl_avg": 365.19354248046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.607421875, + "step": 420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960243701934814 + }, + { + "episode": 6752, + "epoch": 0.04045487771266971, + "loss/policy_avg": -0.011572149582207203, + "lr": 9.730956032719838e-06, + "objective/entropy": 157.015625, + "objective/kl": 40.52888488769531, + "objective/non_score_reward": -2.026444435119629, + "objective/rlhf_reward": -8.105777025222778, + "objective/scores": 0.0, + "policy/approxkl_avg": 36.79645538330078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4921875, + "step": 421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00020170211792 + }, + { + "episode": 6768, + "epoch": 0.04055074235179926, + "loss/policy_avg": 0.09691079705953598, + "lr": 9.730316973415135e-06, + "objective/entropy": 169.43841552734375, + "objective/kl": 38.485565185546875, + "objective/non_score_reward": -1.9242782592773438, + "objective/rlhf_reward": -7.697112798690796, + "objective/scores": 0.0, + "policy/approxkl_avg": 44.24542236328125, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5, + "step": 422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003914833068848 + }, + { + "episode": 6784, + "epoch": 0.040646606990928806, + "loss/policy_avg": 0.34911060333251953, + "lr": 9.72967791411043e-06, + "objective/entropy": 208.74990844726562, + "objective/kl": 37.376976013183594, + "objective/non_score_reward": -1.8688490390777588, + "objective/rlhf_reward": -6.11614605161993, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 94.55046844482422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.607421875, + "step": 423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001683235168457 + }, + { + "episode": 6800, + "epoch": 0.040742471630058355, + "loss/policy_avg": 0.04235881194472313, + "lr": 9.729038854805727e-06, + "objective/entropy": -29.72011947631836, + "objective/kl": 40.81784439086914, + "objective/non_score_reward": -2.0408921241760254, + "objective/rlhf_reward": -6.33874034431846, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 160.66685485839844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68359375, + "step": 424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9954640865325928 + }, + { + "episode": 6816, + "epoch": 0.040838336269187904, + "loss/policy_avg": 0.24281972646713257, + "lr": 9.728399795501023e-06, + "objective/entropy": 191.4829559326172, + "objective/kl": 36.464630126953125, + "objective/non_score_reward": -1.8232316970825195, + "objective/rlhf_reward": -5.688806984488087, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 22.607797622680664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987616539001465 + }, + { + "episode": 6832, + "epoch": 0.04093420090831745, + "loss/policy_avg": 0.28001827001571655, + "lr": 9.72776073619632e-06, + "objective/entropy": 165.8070526123047, + "objective/kl": 41.63663864135742, + "objective/non_score_reward": -2.081831932067871, + "objective/rlhf_reward": -8.327327966690063, + "objective/scores": 0.0, + "policy/approxkl_avg": 62.481040954589844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6953125, + "step": 426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999871253967285 + }, + { + "episode": 6848, + "epoch": 0.041030065547447, + "loss/policy_avg": 0.2623136341571808, + "lr": 9.727121676891617e-06, + "objective/entropy": 93.63900756835938, + "objective/kl": 23.993732452392578, + "objective/non_score_reward": -1.1996865272521973, + "objective/rlhf_reward": -3.28297432640427, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 19.302370071411133, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.591796875, + "step": 427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000486373901367 + }, + { + "episode": 6864, + "epoch": 0.04112593018657655, + "loss/policy_avg": 0.2426406443119049, + "lr": 9.726482617586912e-06, + "objective/entropy": 113.51960754394531, + "objective/kl": 37.362518310546875, + "objective/non_score_reward": -1.8681257963180542, + "objective/rlhf_reward": -9.472503662109375, + "objective/scores": -0.5, + "policy/approxkl_avg": 50.487884521484375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.361328125, + "step": 428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970954656600952 + }, + { + "episode": 6880, + "epoch": 0.0412217948257061, + "loss/policy_avg": 0.8704113960266113, + "lr": 9.72584355828221e-06, + "objective/entropy": 95.84138488769531, + "objective/kl": 36.00267028808594, + "objective/non_score_reward": -1.800133466720581, + "objective/rlhf_reward": -5.37570517805488, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 108.54400634765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65625, + "step": 429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9939507246017456 + }, + { + "episode": 6896, + "epoch": 0.04131765946483565, + "loss/policy_avg": 0.4814898669719696, + "lr": 9.725204498977506e-06, + "objective/entropy": 225.97140502929688, + "objective/kl": 44.52623748779297, + "objective/non_score_reward": -2.2263121604919434, + "objective/rlhf_reward": -7.348988621440485, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 30.636560440063477, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6640625, + "step": 430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988288879394531 + }, + { + "episode": 6912, + "epoch": 0.0414135241039652, + "loss/policy_avg": -0.3788025379180908, + "lr": 9.724565439672803e-06, + "objective/entropy": 173.88014221191406, + "objective/kl": 57.236175537109375, + "objective/non_score_reward": -2.8618087768554688, + "objective/rlhf_reward": -13.447235107421875, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.293506622314453, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.552734375, + "step": 431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000931739807129 + }, + { + "episode": 6928, + "epoch": 0.041509388743094754, + "loss/policy_avg": 0.2732902765274048, + "lr": 9.7239263803681e-06, + "objective/entropy": 109.51008605957031, + "objective/kl": 37.86288833618164, + "objective/non_score_reward": -1.8931443691253662, + "objective/rlhf_reward": -5.449871124998603, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 80.19667053222656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.328125, + "step": 432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976387023925781 + }, + { + "episode": 6944, + "epoch": 0.0416052533822243, + "loss/policy_avg": 0.27860450744628906, + "lr": 9.723287321063397e-06, + "objective/entropy": 36.931854248046875, + "objective/kl": 32.6243896484375, + "objective/non_score_reward": -1.6312193870544434, + "objective/rlhf_reward": -8.524877548217773, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.86789894104004, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.517578125, + "step": 433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978294372558594 + }, + { + "episode": 6960, + "epoch": 0.04170111802135385, + "loss/policy_avg": 0.5596253275871277, + "lr": 9.722648261758692e-06, + "objective/entropy": 199.27142333984375, + "objective/kl": 44.00600051879883, + "objective/non_score_reward": -2.2003002166748047, + "objective/rlhf_reward": -7.350602249713287, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 53.96643829345703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.580078125, + "step": 434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001927614212036 + }, + { + "episode": 6976, + "epoch": 0.0417969826604834, + "loss/policy_avg": 0.7040017247200012, + "lr": 9.722009202453989e-06, + "objective/entropy": 170.60586547851562, + "objective/kl": 41.97274398803711, + "objective/non_score_reward": -2.098637104034424, + "objective/rlhf_reward": -5.470829759479734, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 91.50270080566406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.546875, + "step": 435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999673843383789 + }, + { + "episode": 6992, + "epoch": 0.04189284729961295, + "loss/policy_avg": 0.3991093337535858, + "lr": 9.721370143149284e-06, + "objective/entropy": 180.92874145507812, + "objective/kl": 38.3790283203125, + "objective/non_score_reward": -1.9189512729644775, + "objective/rlhf_reward": -5.942471877733866, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 3.5480709075927734, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.55078125, + "step": 436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001030683517456 + }, + { + "episode": 7008, + "epoch": 0.0419887119387425, + "loss/policy_avg": 0.2966272234916687, + "lr": 9.720731083844581e-06, + "objective/entropy": 142.98663330078125, + "objective/kl": 30.8831729888916, + "objective/non_score_reward": -1.5441588163375854, + "objective/rlhf_reward": -1.776635146141052, + "objective/scores": 1.1, + "policy/approxkl_avg": 19.243196487426758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4873046875, + "step": 437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0003151893615723 + }, + { + "episode": 7024, + "epoch": 0.04208457657787205, + "loss/policy_avg": 0.22927549481391907, + "lr": 9.720092024539878e-06, + "objective/entropy": -2.2561073303222656, + "objective/kl": 46.941993713378906, + "objective/non_score_reward": -2.347099781036377, + "objective/rlhf_reward": -7.784278903071003, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 284.0980224609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.40625, + "step": 438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9965739250183105 + }, + { + "episode": 7040, + "epoch": 0.042180441217001596, + "loss/policy_avg": -0.37906330823898315, + "lr": 9.719452965235175e-06, + "objective/entropy": 190.4355010986328, + "objective/kl": 46.711570739746094, + "objective/non_score_reward": -2.3355789184570312, + "objective/rlhf_reward": -6.942314958572387, + "objective/scores": 0.6, + "policy/approxkl_avg": 124.1642074584961, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.59765625, + "step": 439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0039045810699463 + }, + { + "episode": 7056, + "epoch": 0.042276305856131145, + "loss/policy_avg": 0.2566912770271301, + "lr": 9.718813905930472e-06, + "objective/entropy": 125.67393493652344, + "objective/kl": 41.860557556152344, + "objective/non_score_reward": -2.0930278301239014, + "objective/rlhf_reward": -8.372111141681671, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.688385009765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3818359375, + "step": 440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996503591537476 + }, + { + "episode": 7072, + "epoch": 0.042372170495260694, + "loss/policy_avg": 0.8826879262924194, + "lr": 9.718174846625767e-06, + "objective/entropy": 151.81240844726562, + "objective/kl": 46.84815979003906, + "objective/non_score_reward": -2.342407703399658, + "objective/rlhf_reward": -9.369631052017212, + "objective/scores": 0.0, + "policy/approxkl_avg": 268.04595947265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988102912902832 + }, + { + "episode": 7088, + "epoch": 0.04246803513439024, + "loss/policy_avg": -0.02791355550289154, + "lr": 9.717535787321064e-06, + "objective/entropy": 51.914703369140625, + "objective/kl": 49.47495651245117, + "objective/non_score_reward": -2.473747968673706, + "objective/rlhf_reward": -11.894991874694824, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.78294372558594, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.552734375, + "step": 442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.00296950340271 + }, + { + "episode": 7104, + "epoch": 0.04256389977351979, + "loss/policy_avg": 0.2161749303340912, + "lr": 9.71689672801636e-06, + "objective/entropy": 155.6175079345703, + "objective/kl": 33.92812728881836, + "objective/non_score_reward": -1.696406364440918, + "objective/rlhf_reward": -5.229366033282831, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 162.97238159179688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.474609375, + "step": 443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964041709899902 + }, + { + "episode": 7120, + "epoch": 0.04265976441264934, + "loss/policy_avg": 1.1079907417297363, + "lr": 9.716257668711657e-06, + "objective/entropy": 228.69253540039062, + "objective/kl": 41.69932556152344, + "objective/non_score_reward": -2.0849664211273193, + "objective/rlhf_reward": -10.339865684509277, + "objective/scores": -0.5, + "policy/approxkl_avg": 299.07171630859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689453125, + "step": 444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000746726989746 + }, + { + "episode": 7136, + "epoch": 0.04275562905177889, + "loss/policy_avg": -0.18060393631458282, + "lr": 9.715618609406954e-06, + "objective/entropy": 116.65229034423828, + "objective/kl": 30.7730712890625, + "objective/non_score_reward": -1.5386537313461304, + "objective/rlhf_reward": -4.843933692178131, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 3.5278494358062744, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6953125, + "step": 445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0028085708618164 + }, + { + "episode": 7152, + "epoch": 0.04285149369090844, + "loss/policy_avg": 0.06034235656261444, + "lr": 9.714979550102251e-06, + "objective/entropy": 83.11370849609375, + "objective/kl": 48.45117950439453, + "objective/non_score_reward": -2.4225587844848633, + "objective/rlhf_reward": -11.690235137939453, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.12924575805664, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.619140625, + "step": 446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970791339874268 + }, + { + "episode": 7168, + "epoch": 0.04294735833003799, + "loss/policy_avg": 2.49505352973938, + "lr": 9.714340490797546e-06, + "objective/entropy": -52.68606185913086, + "objective/kl": 40.133392333984375, + "objective/non_score_reward": -2.006669521331787, + "objective/rlhf_reward": -8.026678442955017, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.131301879882812, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.462890625, + "step": 447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003855228424072 + }, + { + "episode": 7184, + "epoch": 0.04304322296916754, + "loss/policy_avg": 0.5659292936325073, + "lr": 9.713701431492843e-06, + "objective/entropy": 182.43234252929688, + "objective/kl": 42.25321578979492, + "objective/non_score_reward": -2.1126608848571777, + "objective/rlhf_reward": -10.450643539428711, + "objective/scores": -0.5, + "policy/approxkl_avg": 101.77164459228516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980123043060303 + }, + { + "episode": 7200, + "epoch": 0.043139087608297086, + "loss/policy_avg": 0.0843241959810257, + "lr": 9.71306237218814e-06, + "objective/entropy": 245.69744873046875, + "objective/kl": 29.222219467163086, + "objective/non_score_reward": -1.4611108303070068, + "objective/rlhf_reward": -5.844443321228027, + "objective/scores": 0.0, + "policy/approxkl_avg": 54.68086242675781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.744140625, + "step": 449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981093406677246 + }, + { + "episode": 7216, + "epoch": 0.043234952247426635, + "loss/policy_avg": 0.09573544561862946, + "lr": 9.712423312883437e-06, + "objective/entropy": 93.39710998535156, + "objective/kl": 36.68029022216797, + "objective/non_score_reward": -1.8340145349502563, + "objective/rlhf_reward": -5.511229510578226, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 11.1849365234375, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.2919921875, + "step": 450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0029678344726562 + }, + { + "episode": 7232, + "epoch": 0.043330816886556184, + "loss/policy_avg": 0.7713199853897095, + "lr": 9.711784253578734e-06, + "objective/entropy": 172.16571044921875, + "objective/kl": 45.20444107055664, + "objective/non_score_reward": -2.2602221965789795, + "objective/rlhf_reward": -7.662286260215145, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 202.79624938964844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.546875, + "step": 451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003104209899902 + }, + { + "episode": 7248, + "epoch": 0.04342668152568573, + "loss/policy_avg": 0.33446595072746277, + "lr": 9.711145194274029e-06, + "objective/entropy": 200.54441833496094, + "objective/kl": 43.61439514160156, + "objective/non_score_reward": -2.1807198524475098, + "objective/rlhf_reward": -8.72287917137146, + "objective/scores": 0.0, + "policy/approxkl_avg": 175.7442626953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.60546875, + "step": 452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998628854751587 + }, + { + "episode": 7264, + "epoch": 0.04352254616481528, + "loss/policy_avg": 0.38772690296173096, + "lr": 9.710506134969326e-06, + "objective/entropy": 41.2994384765625, + "objective/kl": 44.39094543457031, + "objective/non_score_reward": -2.2195472717285156, + "objective/rlhf_reward": -10.878189086914062, + "objective/scores": -0.5, + "policy/approxkl_avg": 134.41238403320312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.544921875, + "step": 453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978395700454712 + }, + { + "episode": 7280, + "epoch": 0.04361841080394483, + "loss/policy_avg": 0.6301360130310059, + "lr": 9.709867075664623e-06, + "objective/entropy": 107.89152526855469, + "objective/kl": 38.03111267089844, + "objective/non_score_reward": -1.9015557765960693, + "objective/rlhf_reward": -9.606223106384277, + "objective/scores": -0.5, + "policy/approxkl_avg": 58.21129608154297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977179765701294 + }, + { + "episode": 7296, + "epoch": 0.04371427544307438, + "loss/policy_avg": -0.06993488222360611, + "lr": 9.70922801635992e-06, + "objective/entropy": 212.8966827392578, + "objective/kl": 27.732505798339844, + "objective/non_score_reward": -1.3866255283355713, + "objective/rlhf_reward": -7.546501636505127, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.7735340595245361, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6328125, + "step": 455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002204179763794 + }, + { + "episode": 7312, + "epoch": 0.04381014008220393, + "loss/policy_avg": 1.5585349798202515, + "lr": 9.708588957055215e-06, + "objective/entropy": 8.907943725585938, + "objective/kl": 46.3431510925293, + "objective/non_score_reward": -2.317157506942749, + "objective/rlhf_reward": -9.268629848957062, + "objective/scores": 0.0, + "policy/approxkl_avg": 82.92784118652344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4375, + "step": 456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991109371185303 + }, + { + "episode": 7328, + "epoch": 0.04390600472133348, + "loss/policy_avg": 0.46995729207992554, + "lr": 9.707949897750512e-06, + "objective/entropy": 2.804157257080078, + "objective/kl": 24.88430404663086, + "objective/non_score_reward": -1.2442151308059692, + "objective/rlhf_reward": -2.8541544101395946, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 16.516504287719727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.615234375, + "step": 457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996424913406372 + }, + { + "episode": 7344, + "epoch": 0.044001869360463026, + "loss/policy_avg": 1.0241265296936035, + "lr": 9.707310838445809e-06, + "objective/entropy": 261.0165100097656, + "objective/kl": 39.613746643066406, + "objective/non_score_reward": -1.9806873798370361, + "objective/rlhf_reward": -7.922749698162079, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.124380111694336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.810546875, + "step": 458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987176656723022 + }, + { + "episode": 7360, + "epoch": 0.044097733999592575, + "loss/policy_avg": 0.18850752711296082, + "lr": 9.706671779141105e-06, + "objective/entropy": 123.50984191894531, + "objective/kl": 45.86781692504883, + "objective/non_score_reward": -2.2933907508850098, + "objective/rlhf_reward": -4.773563241958618, + "objective/scores": 1.1, + "policy/approxkl_avg": 22.56751251220703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.513671875, + "step": 459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000584125518799 + }, + { + "episode": 7376, + "epoch": 0.044193598638722124, + "loss/policy_avg": 0.3746855854988098, + "lr": 9.7060327198364e-06, + "objective/entropy": 146.0401611328125, + "objective/kl": 38.55249786376953, + "objective/non_score_reward": -1.9276249408721924, + "objective/rlhf_reward": -7.71049952507019, + "objective/scores": 0.0, + "policy/approxkl_avg": 211.50054931640625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.732421875, + "step": 460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972167015075684 + }, + { + "episode": 7392, + "epoch": 0.04428946327785167, + "loss/policy_avg": 0.5910313725471497, + "lr": 9.705393660531698e-06, + "objective/entropy": 198.9757080078125, + "objective/kl": 37.61479187011719, + "objective/non_score_reward": -1.880739450454712, + "objective/rlhf_reward": -9.522957801818848, + "objective/scores": -0.5, + "policy/approxkl_avg": 204.2825469970703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.619140625, + "step": 461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974172115325928 + }, + { + "episode": 7408, + "epoch": 0.04438532791698122, + "loss/policy_avg": -0.04315639287233353, + "lr": 9.704754601226994e-06, + "objective/entropy": 156.10818481445312, + "objective/kl": 26.727436065673828, + "objective/non_score_reward": -1.336371660232544, + "objective/rlhf_reward": -7.345486640930176, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.13689422607422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.501953125, + "step": 462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999086856842041 + }, + { + "episode": 7424, + "epoch": 0.04448119255611077, + "loss/policy_avg": 0.11912831664085388, + "lr": 9.704115541922291e-06, + "objective/entropy": 153.92323303222656, + "objective/kl": 29.754222869873047, + "objective/non_score_reward": -1.4877111911773682, + "objective/rlhf_reward": -7.950844764709473, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.665206909179688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.763671875, + "step": 463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983062744140625 + }, + { + "episode": 7440, + "epoch": 0.04457705719524032, + "loss/policy_avg": 0.34447717666625977, + "lr": 9.703476482617588e-06, + "objective/entropy": 20.401763916015625, + "objective/kl": 36.71702575683594, + "objective/non_score_reward": -1.8358510732650757, + "objective/rlhf_reward": -7.343404173851013, + "objective/scores": 0.0, + "policy/approxkl_avg": 50.746131896972656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.505859375, + "step": 464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996228218078613 + }, + { + "episode": 7456, + "epoch": 0.04467292183436987, + "loss/policy_avg": 0.05298962816596031, + "lr": 9.702837423312883e-06, + "objective/entropy": 107.51678466796875, + "objective/kl": 30.20279312133789, + "objective/non_score_reward": -1.5101397037506104, + "objective/rlhf_reward": -8.040558815002441, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.385878086090088, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.595703125, + "step": 465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973207712173462 + }, + { + "episode": 7472, + "epoch": 0.04476878647349942, + "loss/policy_avg": 0.47295618057250977, + "lr": 9.70219836400818e-06, + "objective/entropy": 186.91851806640625, + "objective/kl": 39.759700775146484, + "objective/non_score_reward": -1.9879851341247559, + "objective/rlhf_reward": -6.347820553843098, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 6.087001800537109, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.556640625, + "step": 466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997063875198364 + }, + { + "episode": 7488, + "epoch": 0.04486465111262897, + "loss/policy_avg": 0.3874097764492035, + "lr": 9.701559304703477e-06, + "objective/entropy": 173.0869903564453, + "objective/kl": 43.96525192260742, + "objective/non_score_reward": -2.1982626914978027, + "objective/rlhf_reward": -7.414448239890438, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 85.4808349609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.53515625, + "step": 467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984729290008545 + }, + { + "episode": 7504, + "epoch": 0.044960515751758516, + "loss/policy_avg": -0.2668102979660034, + "lr": 9.700920245398774e-06, + "objective/entropy": 71.85359191894531, + "objective/kl": 43.253211975097656, + "objective/non_score_reward": -2.162660598754883, + "objective/rlhf_reward": -10.650642395019531, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.98958969116211, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.564453125, + "step": 468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.006080389022827 + }, + { + "episode": 7520, + "epoch": 0.045056380390888065, + "loss/policy_avg": 0.3569624722003937, + "lr": 9.700281186094071e-06, + "objective/entropy": 189.16136169433594, + "objective/kl": 40.66926956176758, + "objective/non_score_reward": -2.033463478088379, + "objective/rlhf_reward": -8.133854150772095, + "objective/scores": 0.0, + "policy/approxkl_avg": 19.30429458618164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99971604347229 + }, + { + "episode": 7536, + "epoch": 0.045152245030017614, + "loss/policy_avg": 0.37975427508354187, + "lr": 9.699642126789368e-06, + "objective/entropy": 38.9386100769043, + "objective/kl": 35.93472671508789, + "objective/non_score_reward": -1.796736478805542, + "objective/rlhf_reward": -5.736347894282684, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 93.73966979980469, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.57421875, + "step": 470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998188853263855 + }, + { + "episode": 7552, + "epoch": 0.04524810966914716, + "loss/policy_avg": 0.2126319706439972, + "lr": 9.699003067484663e-06, + "objective/entropy": 14.602066040039062, + "objective/kl": 43.82414245605469, + "objective/non_score_reward": -2.19120717048645, + "objective/rlhf_reward": -10.764827728271484, + "objective/scores": -0.5, + "policy/approxkl_avg": 75.86665344238281, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4833984375, + "step": 471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.005751132965088 + }, + { + "episode": 7568, + "epoch": 0.04534397430827671, + "loss/policy_avg": 0.5844190120697021, + "lr": 9.69836400817996e-06, + "objective/entropy": 84.61041259765625, + "objective/kl": 51.00375747680664, + "objective/non_score_reward": -2.5501880645751953, + "objective/rlhf_reward": -12.200752258300781, + "objective/scores": -0.5, + "policy/approxkl_avg": 147.34310913085938, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.44921875, + "step": 472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002200603485107 + }, + { + "episode": 7584, + "epoch": 0.04543983894740626, + "loss/policy_avg": -0.19812732934951782, + "lr": 9.697724948875257e-06, + "objective/entropy": 59.431541442871094, + "objective/kl": 42.676483154296875, + "objective/non_score_reward": -2.133824110031128, + "objective/rlhf_reward": -8.53529667854309, + "objective/scores": 0.0, + "policy/approxkl_avg": 25.93294906616211, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5625, + "step": 473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001767873764038 + }, + { + "episode": 7600, + "epoch": 0.04553570358653581, + "loss/policy_avg": -0.1512744426727295, + "lr": 9.697085889570554e-06, + "objective/entropy": 34.99419403076172, + "objective/kl": 40.56854248046875, + "objective/non_score_reward": -2.0284271240234375, + "objective/rlhf_reward": -8.113708853721619, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.354091644287109, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.509765625, + "step": 474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0055007934570312 + }, + { + "episode": 7616, + "epoch": 0.04563156822566536, + "loss/policy_avg": 0.5833158493041992, + "lr": 9.69644683026585e-06, + "objective/entropy": 106.2252197265625, + "objective/kl": 37.68205261230469, + "objective/non_score_reward": -1.8841025829315186, + "objective/rlhf_reward": -5.980151384082392, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 22.509538650512695, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982876777648926 + }, + { + "episode": 7632, + "epoch": 0.04572743286479491, + "loss/policy_avg": 0.01958562433719635, + "lr": 9.695807770961146e-06, + "objective/entropy": 121.47393035888672, + "objective/kl": 48.13959503173828, + "objective/non_score_reward": -2.40697979927063, + "objective/rlhf_reward": -8.11214765289658, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 75.11579895019531, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.44140625, + "step": 476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999737024307251 + }, + { + "episode": 7648, + "epoch": 0.04582329750392446, + "loss/policy_avg": 0.6820676326751709, + "lr": 9.695168711656443e-06, + "objective/entropy": 115.61346435546875, + "objective/kl": 37.32343292236328, + "objective/non_score_reward": -1.8661715984344482, + "objective/rlhf_reward": -9.464686393737793, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.16072654724121, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000274419784546 + }, + { + "episode": 7664, + "epoch": 0.045919162143054006, + "loss/policy_avg": 0.40561172366142273, + "lr": 9.694529652351738e-06, + "objective/entropy": 90.63370513916016, + "objective/kl": 37.22323989868164, + "objective/non_score_reward": -1.8611619472503662, + "objective/rlhf_reward": -5.9940495296434015, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 285.416259765625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.662109375, + "step": 478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9959321022033691 + }, + { + "episode": 7680, + "epoch": 0.046015026782183555, + "loss/policy_avg": 0.17466656863689423, + "lr": 9.693890593047035e-06, + "objective/entropy": 54.67130661010742, + "objective/kl": 41.81562805175781, + "objective/non_score_reward": -2.0907812118530273, + "objective/rlhf_reward": -8.363124966621399, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4145569801330566, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975910186767578 + }, + { + "episode": 7696, + "epoch": 0.046110891421313104, + "loss/policy_avg": 0.4119042158126831, + "lr": 9.693251533742331e-06, + "objective/entropy": 92.60330963134766, + "objective/kl": 35.919769287109375, + "objective/non_score_reward": -1.7959884405136108, + "objective/rlhf_reward": -5.45062030951182, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 57.694190979003906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.279296875, + "step": 480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003273487091064 + }, + { + "episode": 7712, + "epoch": 0.04620675606044265, + "loss/policy_avg": 0.06249671056866646, + "lr": 9.692612474437628e-06, + "objective/entropy": 102.40875244140625, + "objective/kl": 40.75782775878906, + "objective/non_score_reward": -2.037891387939453, + "objective/rlhf_reward": -10.151565551757812, + "objective/scores": -0.5, + "policy/approxkl_avg": 141.67388916015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004329681396484 + }, + { + "episode": 7728, + "epoch": 0.0463026206995722, + "loss/policy_avg": 0.3060109615325928, + "lr": 9.691973415132925e-06, + "objective/entropy": 149.1110076904297, + "objective/kl": 42.07062530517578, + "objective/non_score_reward": -2.1035311222076416, + "objective/rlhf_reward": -10.414125442504883, + "objective/scores": -0.5, + "policy/approxkl_avg": 73.67881774902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.630859375, + "step": 482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.995924472808838 + }, + { + "episode": 7744, + "epoch": 0.04639848533870175, + "loss/policy_avg": 0.7175555229187012, + "lr": 9.691334355828222e-06, + "objective/entropy": 2.7473793029785156, + "objective/kl": 40.123050689697266, + "objective/non_score_reward": -2.006152629852295, + "objective/rlhf_reward": -8.024610042572021, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.62218952178955, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.52734375, + "step": 483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981098175048828 + }, + { + "episode": 7760, + "epoch": 0.0464943499778313, + "loss/policy_avg": 0.19902104139328003, + "lr": 9.690695296523517e-06, + "objective/entropy": 18.798885345458984, + "objective/kl": 38.83830261230469, + "objective/non_score_reward": -1.9419152736663818, + "objective/rlhf_reward": -9.767661094665527, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.263641357421875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.572265625, + "step": 484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000005006790161 + }, + { + "episode": 7776, + "epoch": 0.04659021461696085, + "loss/policy_avg": 0.11375686526298523, + "lr": 9.690056237218814e-06, + "objective/entropy": 132.22962951660156, + "objective/kl": 34.968589782714844, + "objective/non_score_reward": -1.748429536819458, + "objective/rlhf_reward": -8.993717193603516, + "objective/scores": -0.5, + "policy/approxkl_avg": 121.77729797363281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.55859375, + "step": 485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999431848526001 + }, + { + "episode": 7792, + "epoch": 0.0466860792560904, + "loss/policy_avg": 0.47958219051361084, + "lr": 9.689417177914111e-06, + "objective/entropy": 35.028343200683594, + "objective/kl": 35.37997055053711, + "objective/non_score_reward": -1.7689985036849976, + "objective/rlhf_reward": -7.07599413394928, + "objective/scores": 0.0, + "policy/approxkl_avg": 24.928985595703125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.529296875, + "step": 486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9948756694793701 + }, + { + "episode": 7808, + "epoch": 0.046781943895219946, + "loss/policy_avg": 0.09556100517511368, + "lr": 9.688778118609408e-06, + "objective/entropy": 148.95831298828125, + "objective/kl": 35.879150390625, + "objective/non_score_reward": -1.7939574718475342, + "objective/rlhf_reward": -5.797227957335812, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 71.48307037353516, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4384765625, + "step": 487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004147529602051 + }, + { + "episode": 7824, + "epoch": 0.046877808534349495, + "loss/policy_avg": 0.5969531536102295, + "lr": 9.688139059304705e-06, + "objective/entropy": 153.7490234375, + "objective/kl": 48.861671447753906, + "objective/non_score_reward": -2.4430835247039795, + "objective/rlhf_reward": -8.372334098815918, + "objective/scores": 0.35, + "policy/approxkl_avg": 9.396366119384766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.501953125, + "step": 488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984536170959473 + }, + { + "episode": 7840, + "epoch": 0.046973673173479044, + "loss/policy_avg": 0.1210302859544754, + "lr": 9.6875e-06, + "objective/entropy": 49.37278366088867, + "objective/kl": 35.451751708984375, + "objective/non_score_reward": -1.7725876569747925, + "objective/rlhf_reward": -9.090351104736328, + "objective/scores": -0.5, + "policy/approxkl_avg": 88.43473815917969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.544921875, + "step": 489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997957706451416 + }, + { + "episode": 7856, + "epoch": 0.0470695378126086, + "loss/policy_avg": 0.08776310086250305, + "lr": 9.686860940695297e-06, + "objective/entropy": 47.7550048828125, + "objective/kl": 41.63129425048828, + "objective/non_score_reward": -2.0815649032592773, + "objective/rlhf_reward": -8.32625961303711, + "objective/scores": 0.0, + "policy/approxkl_avg": 38.46742248535156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.39453125, + "step": 490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998126745223999 + }, + { + "episode": 7872, + "epoch": 0.04716540245173815, + "loss/policy_avg": -0.2399851381778717, + "lr": 9.686221881390594e-06, + "objective/entropy": 16.01573944091797, + "objective/kl": 30.493085861206055, + "objective/non_score_reward": -1.5246543884277344, + "objective/rlhf_reward": -4.674785335262385, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 26.804075241088867, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4716796875, + "step": 491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998912811279297 + }, + { + "episode": 7888, + "epoch": 0.0472612670908677, + "loss/policy_avg": 0.13263994455337524, + "lr": 9.68558282208589e-06, + "objective/entropy": 140.36886596679688, + "objective/kl": 35.053714752197266, + "objective/non_score_reward": -1.752685546875, + "objective/rlhf_reward": -9.0107421875, + "objective/scores": -0.5, + "policy/approxkl_avg": 41.00921630859375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.701171875, + "step": 492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989254474639893 + }, + { + "episode": 7904, + "epoch": 0.04735713172999725, + "loss/policy_avg": 0.03523946925997734, + "lr": 9.684943762781188e-06, + "objective/entropy": 160.34219360351562, + "objective/kl": 34.51702880859375, + "objective/non_score_reward": -1.72585129737854, + "objective/rlhf_reward": -8.903406143188477, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.8806684613227844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.595703125, + "step": 493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0012478828430176 + }, + { + "episode": 7920, + "epoch": 0.047452996369126796, + "loss/policy_avg": 0.911601185798645, + "lr": 9.684304703476484e-06, + "objective/entropy": 2.773548126220703, + "objective/kl": 37.37300109863281, + "objective/non_score_reward": -1.868650197982788, + "objective/rlhf_reward": -9.474600791931152, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.019964218139648, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69140625, + "step": 494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992573261260986 + }, + { + "episode": 7936, + "epoch": 0.047548861008256345, + "loss/policy_avg": 0.34348466992378235, + "lr": 9.68366564417178e-06, + "objective/entropy": 121.98845672607422, + "objective/kl": 24.834556579589844, + "objective/non_score_reward": -1.2417279481887817, + "objective/rlhf_reward": -0.566911673545837, + "objective/scores": 1.1, + "policy/approxkl_avg": 22.85826301574707, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.572265625, + "step": 495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001675605773926 + }, + { + "episode": 7952, + "epoch": 0.047644725647385894, + "loss/policy_avg": 0.10158610343933105, + "lr": 9.683026584867076e-06, + "objective/entropy": 135.89080810546875, + "objective/kl": 33.785274505615234, + "objective/non_score_reward": -1.6892638206481934, + "objective/rlhf_reward": -6.757055282592773, + "objective/scores": 0.0, + "policy/approxkl_avg": 60.38434600830078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51953125, + "step": 496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9961328506469727 + }, + { + "episode": 7968, + "epoch": 0.04774059028651544, + "loss/policy_avg": 0.23412325978279114, + "lr": 9.682387525562373e-06, + "objective/entropy": 125.34407806396484, + "objective/kl": 30.21947479248047, + "objective/non_score_reward": -1.5109736919403076, + "objective/rlhf_reward": -4.7332141306012865, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 55.509063720703125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.66015625, + "step": 497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988863468170166 + }, + { + "episode": 7984, + "epoch": 0.04783645492564499, + "loss/policy_avg": 0.19422444701194763, + "lr": 9.68174846625767e-06, + "objective/entropy": -99.93417358398438, + "objective/kl": 36.70722579956055, + "objective/non_score_reward": -1.8353612422943115, + "objective/rlhf_reward": -9.341445922851562, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.86932373046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5234375, + "step": 498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983272552490234 + }, + { + "episode": 8000, + "epoch": 0.04793231956477454, + "loss/policy_avg": 0.7712575197219849, + "lr": 9.681109406952967e-06, + "objective/entropy": 185.83950805664062, + "objective/kl": 40.12968063354492, + "objective/non_score_reward": -2.006484031677246, + "objective/rlhf_reward": -6.078524629549916, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 107.38670349121094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.56640625, + "step": 499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978251457214355 + }, + { + "episode": 8016, + "epoch": 0.04802818420390409, + "loss/policy_avg": 0.048789143562316895, + "lr": 9.680470347648262e-06, + "objective/entropy": 80.77729797363281, + "objective/kl": 43.95686340332031, + "objective/non_score_reward": -2.197843313217163, + "objective/rlhf_reward": -7.2756017086827125, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 23.518718719482422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 500, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0010409355163574 + }, + { + "episode": 8032, + "epoch": 0.04812404884303364, + "loss/policy_avg": 1.3080418109893799, + "lr": 9.67983128834356e-06, + "objective/entropy": 25.42633628845215, + "objective/kl": 34.060516357421875, + "objective/non_score_reward": -1.7030255794525146, + "objective/rlhf_reward": -5.207982692782002, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 10.431194305419922, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.556640625, + "step": 501, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000088930130005 + }, + { + "episode": 8048, + "epoch": 0.04821991348216319, + "loss/policy_avg": 0.543501615524292, + "lr": 9.679192229038854e-06, + "objective/entropy": 95.14369201660156, + "objective/kl": 32.53972625732422, + "objective/non_score_reward": -1.6269863843917847, + "objective/rlhf_reward": -8.50794506072998, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.882524490356445, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.765625, + "step": 502, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998069167137146 + }, + { + "episode": 8064, + "epoch": 0.048315778121292736, + "loss/policy_avg": 0.46753692626953125, + "lr": 9.678553169734151e-06, + "objective/entropy": 182.4403533935547, + "objective/kl": 51.98662567138672, + "objective/non_score_reward": -2.5993313789367676, + "objective/rlhf_reward": -12.39732551574707, + "objective/scores": -0.5, + "policy/approxkl_avg": 97.35031127929688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689453125, + "step": 503, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995888471603394 + }, + { + "episode": 8080, + "epoch": 0.048411642760422285, + "loss/policy_avg": 0.38062262535095215, + "lr": 9.677914110429448e-06, + "objective/entropy": 148.70054626464844, + "objective/kl": 44.446372985839844, + "objective/non_score_reward": -2.222318649291992, + "objective/rlhf_reward": -10.889274597167969, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.08675765991211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.525390625, + "step": 504, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998737096786499 + }, + { + "episode": 8096, + "epoch": 0.048507507399551834, + "loss/policy_avg": 0.2571317255496979, + "lr": 9.677275051124745e-06, + "objective/entropy": 67.0958251953125, + "objective/kl": 42.69478988647461, + "objective/non_score_reward": -2.134739398956299, + "objective/rlhf_reward": -7.023185694011387, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 69.5447998046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 505, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.995441198348999 + }, + { + "episode": 8112, + "epoch": 0.04860337203868138, + "loss/policy_avg": 0.26023709774017334, + "lr": 9.676635991820042e-06, + "objective/entropy": 134.39599609375, + "objective/kl": 50.59172821044922, + "objective/non_score_reward": -2.5295865535736084, + "objective/rlhf_reward": -12.118346214294434, + "objective/scores": -0.5, + "policy/approxkl_avg": 117.91007995605469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.697265625, + "step": 506, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979584217071533 + }, + { + "episode": 8128, + "epoch": 0.04869923667781093, + "loss/policy_avg": 0.023366611450910568, + "lr": 9.675996932515339e-06, + "objective/entropy": 6.523872375488281, + "objective/kl": 33.86302185058594, + "objective/non_score_reward": -1.6931511163711548, + "objective/rlhf_reward": -5.394002297011715, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.230093240737915, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4658203125, + "step": 507, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001136302947998 + }, + { + "episode": 8144, + "epoch": 0.04879510131694048, + "loss/policy_avg": 0.1962001621723175, + "lr": 9.675357873210634e-06, + "objective/entropy": 101.35081481933594, + "objective/kl": 51.96517562866211, + "objective/non_score_reward": -2.5982584953308105, + "objective/rlhf_reward": -9.033784830306454, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 6.275201797485352, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 508, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995527267456055 + }, + { + "episode": 8160, + "epoch": 0.04889096595607003, + "loss/policy_avg": 0.32110342383384705, + "lr": 9.67471881390593e-06, + "objective/entropy": 21.122821807861328, + "objective/kl": 40.87200927734375, + "objective/non_score_reward": -2.04360032081604, + "objective/rlhf_reward": -3.774401760101318, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.06642150878906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.826171875, + "step": 509, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997953176498413 + }, + { + "episode": 8176, + "epoch": 0.04898683059519958, + "loss/policy_avg": 0.11684095114469528, + "lr": 9.674079754601228e-06, + "objective/entropy": 123.6054916381836, + "objective/kl": 43.744590759277344, + "objective/non_score_reward": -2.1872293949127197, + "objective/rlhf_reward": -4.348917520046234, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.2953643798828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4482421875, + "step": 510, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00039005279541 + }, + { + "episode": 8192, + "epoch": 0.04908269523432913, + "loss/policy_avg": 0.21204860508441925, + "lr": 9.673440695296525e-06, + "objective/entropy": 92.97704315185547, + "objective/kl": 33.71581268310547, + "objective/non_score_reward": -1.6857905387878418, + "objective/rlhf_reward": -8.743162155151367, + "objective/scores": -0.5, + "policy/approxkl_avg": 78.84205627441406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4951171875, + "step": 511, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0001296997070312 + }, + { + "episode": 8208, + "epoch": 0.04917855987345868, + "loss/policy_avg": 0.2907988429069519, + "lr": 9.672801635991821e-06, + "objective/entropy": 142.2095184326172, + "objective/kl": 46.76347351074219, + "objective/non_score_reward": -2.3381738662719727, + "objective/rlhf_reward": -11.35269546508789, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.935170650482178, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 512, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997193813323975 + }, + { + "episode": 8224, + "epoch": 0.049274424512588226, + "loss/policy_avg": -0.0883089154958725, + "lr": 9.672162576687117e-06, + "objective/entropy": 78.84781646728516, + "objective/kl": 37.297393798828125, + "objective/non_score_reward": -1.8648698329925537, + "objective/rlhf_reward": -7.459479093551636, + "objective/scores": 0.0, + "policy/approxkl_avg": 47.29024124145508, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4755859375, + "step": 513, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0175886154174805 + }, + { + "episode": 8240, + "epoch": 0.049370289151717775, + "loss/policy_avg": 1.227190375328064, + "lr": 9.671523517382413e-06, + "objective/entropy": 153.3360595703125, + "objective/kl": 42.09587097167969, + "objective/non_score_reward": -2.1047935485839844, + "objective/rlhf_reward": -10.419174194335938, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.333316802978516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 514, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991519451141357 + }, + { + "episode": 8256, + "epoch": 0.049466153790847324, + "loss/policy_avg": 0.5838215947151184, + "lr": 9.67088445807771e-06, + "objective/entropy": 139.77813720703125, + "objective/kl": 33.785945892333984, + "objective/non_score_reward": -1.6892971992492676, + "objective/rlhf_reward": -4.93236028698356, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 99.73694610595703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4208984375, + "step": 515, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996287822723389 + }, + { + "episode": 8272, + "epoch": 0.04956201842997687, + "loss/policy_avg": 0.059313490986824036, + "lr": 9.670245398773007e-06, + "objective/entropy": 14.463359832763672, + "objective/kl": 34.218177795410156, + "objective/non_score_reward": -1.710909128189087, + "objective/rlhf_reward": -8.843635559082031, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.5083122253418, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.65625, + "step": 516, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997997760772705 + }, + { + "episode": 8288, + "epoch": 0.04965788306910642, + "loss/policy_avg": 0.2258259505033493, + "lr": 9.669606339468304e-06, + "objective/entropy": -95.49360656738281, + "objective/kl": 22.19683837890625, + "objective/non_score_reward": -1.10984206199646, + "objective/rlhf_reward": -4.439367949962616, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.760451316833496, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 517, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969854354858398 + }, + { + "episode": 8304, + "epoch": 0.04975374770823597, + "loss/policy_avg": 0.10804040729999542, + "lr": 9.668967280163601e-06, + "objective/entropy": 143.58779907226562, + "objective/kl": 35.785003662109375, + "objective/non_score_reward": -1.789250135421753, + "objective/rlhf_reward": -9.157001495361328, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.657981872558594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.794921875, + "step": 518, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9962890148162842 + }, + { + "episode": 8320, + "epoch": 0.04984961234736552, + "loss/policy_avg": 0.009697876870632172, + "lr": 9.668328220858896e-06, + "objective/entropy": 10.224929809570312, + "objective/kl": 37.23715591430664, + "objective/non_score_reward": -1.8618578910827637, + "objective/rlhf_reward": -5.891172378268793, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.469694972038269, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3876953125, + "step": 519, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995241165161133 + }, + { + "episode": 8336, + "epoch": 0.04994547698649507, + "loss/policy_avg": 0.07935798913240433, + "lr": 9.667689161554193e-06, + "objective/entropy": 37.64440155029297, + "objective/kl": 41.3823356628418, + "objective/non_score_reward": -2.0691165924072266, + "objective/rlhf_reward": -10.276466369628906, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.435813903808594, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.546875, + "step": 520, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001011848449707 + }, + { + "episode": 8352, + "epoch": 0.05004134162562462, + "loss/policy_avg": -0.02070830762386322, + "lr": 9.66705010224949e-06, + "objective/entropy": 58.74858474731445, + "objective/kl": 43.55432891845703, + "objective/non_score_reward": -2.1777162551879883, + "objective/rlhf_reward": -7.332263686744076, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 6.468544960021973, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.607421875, + "step": 521, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001483917236328 + }, + { + "episode": 8368, + "epoch": 0.05013720626475417, + "loss/policy_avg": 0.46028798818588257, + "lr": 9.666411042944787e-06, + "objective/entropy": -78.76937866210938, + "objective/kl": 23.173397064208984, + "objective/non_score_reward": -1.1586699485778809, + "objective/rlhf_reward": -6.634679794311523, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.691064834594727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51953125, + "step": 522, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976732730865479 + }, + { + "episode": 8384, + "epoch": 0.050233070903883716, + "loss/policy_avg": 0.18045517802238464, + "lr": 9.665771983640082e-06, + "objective/entropy": 46.4280891418457, + "objective/kl": 40.75489044189453, + "objective/non_score_reward": -2.0377445220947266, + "objective/rlhf_reward": -8.150978326797485, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3056907653808594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44921875, + "step": 523, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977219104766846 + }, + { + "episode": 8400, + "epoch": 0.050328935543013265, + "loss/policy_avg": 0.14005348086357117, + "lr": 9.665132924335379e-06, + "objective/entropy": 30.292451858520508, + "objective/kl": 34.13593292236328, + "objective/non_score_reward": -1.706796646118164, + "objective/rlhf_reward": -8.827186584472656, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.698740005493164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4794921875, + "step": 524, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0000524520874023 + }, + { + "episode": 8416, + "epoch": 0.050424800182142814, + "loss/policy_avg": 0.06975753605365753, + "lr": 9.664493865030676e-06, + "objective/entropy": 83.52384948730469, + "objective/kl": 34.249717712402344, + "objective/non_score_reward": -1.7124860286712646, + "objective/rlhf_reward": -6.849944233894348, + "objective/scores": 0.0, + "policy/approxkl_avg": 24.646453857421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58203125, + "step": 525, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0034072399139404 + }, + { + "episode": 8432, + "epoch": 0.05052066482127236, + "loss/policy_avg": 0.22392672300338745, + "lr": 9.663854805725971e-06, + "objective/entropy": 64.02466583251953, + "objective/kl": 34.11146545410156, + "objective/non_score_reward": -1.705573320388794, + "objective/rlhf_reward": -8.822293281555176, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.05424118041992, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 526, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970359802246094 + }, + { + "episode": 8448, + "epoch": 0.05061652946040191, + "loss/policy_avg": 1.0532065629959106, + "lr": 9.663215746421268e-06, + "objective/entropy": 90.72592163085938, + "objective/kl": 41.28602600097656, + "objective/non_score_reward": -2.0643014907836914, + "objective/rlhf_reward": -6.931692812472505, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 31.192419052124023, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4990234375, + "step": 527, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001697540283203 + }, + { + "episode": 8464, + "epoch": 0.05071239409953146, + "loss/policy_avg": 0.0644269585609436, + "lr": 9.662576687116565e-06, + "objective/entropy": 67.44807434082031, + "objective/kl": 36.54154586791992, + "objective/non_score_reward": -1.8270775079727173, + "objective/rlhf_reward": -7.30830979347229, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.966569900512695, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5078125, + "step": 528, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998882532119751 + }, + { + "episode": 8480, + "epoch": 0.05080825873866101, + "loss/policy_avg": -0.2570219039916992, + "lr": 9.661937627811862e-06, + "objective/entropy": -26.96208953857422, + "objective/kl": 34.49934768676758, + "objective/non_score_reward": -1.7249674797058105, + "objective/rlhf_reward": -6.899869918823242, + "objective/scores": 0.0, + "policy/approxkl_avg": 20.390092849731445, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3447265625, + "step": 529, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000749111175537 + }, + { + "episode": 8496, + "epoch": 0.05090412337779056, + "loss/policy_avg": 0.25002214312553406, + "lr": 9.661298568507158e-06, + "objective/entropy": 91.19659423828125, + "objective/kl": 35.412471771240234, + "objective/non_score_reward": -1.7706236839294434, + "objective/rlhf_reward": -9.082494735717773, + "objective/scores": -0.5, + "policy/approxkl_avg": 166.57394409179688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4619140625, + "step": 530, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9965037107467651 + }, + { + "episode": 8512, + "epoch": 0.05099998801692011, + "loss/policy_avg": 0.022846542298793793, + "lr": 9.660659509202455e-06, + "objective/entropy": -37.26931381225586, + "objective/kl": 39.86629867553711, + "objective/non_score_reward": -1.9933149814605713, + "objective/rlhf_reward": -7.973260045051575, + "objective/scores": 0.0, + "policy/approxkl_avg": 56.44502258300781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4677734375, + "step": 531, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974274635314941 + }, + { + "episode": 8528, + "epoch": 0.051095852656049656, + "loss/policy_avg": 0.12022869288921356, + "lr": 9.66002044989775e-06, + "objective/entropy": 67.10712432861328, + "objective/kl": 41.30962371826172, + "objective/non_score_reward": -2.065481185913086, + "objective/rlhf_reward": -8.261924982070923, + "objective/scores": 0.0, + "policy/approxkl_avg": 46.29387664794922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55859375, + "step": 532, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975337982177734 + }, + { + "episode": 8544, + "epoch": 0.051191717295179205, + "loss/policy_avg": 0.6448026895523071, + "lr": 9.659381390593047e-06, + "objective/entropy": 11.027664184570312, + "objective/kl": 40.83232498168945, + "objective/non_score_reward": -2.0416159629821777, + "objective/rlhf_reward": -8.16646432876587, + "objective/scores": 0.0, + "policy/approxkl_avg": 28.078399658203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.60546875, + "step": 533, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000098943710327 + }, + { + "episode": 8560, + "epoch": 0.051287581934308754, + "loss/policy_avg": 0.8582497835159302, + "lr": 9.658742331288344e-06, + "objective/entropy": 113.14666748046875, + "objective/kl": 42.10472106933594, + "objective/non_score_reward": -2.105236053466797, + "objective/rlhf_reward": -8.420944571495056, + "objective/scores": 0.0, + "policy/approxkl_avg": 14.971941947937012, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.548828125, + "step": 534, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971849918365479 + }, + { + "episode": 8576, + "epoch": 0.0513834465734383, + "loss/policy_avg": 0.038903310894966125, + "lr": 9.658103271983641e-06, + "objective/entropy": 143.5253448486328, + "objective/kl": 42.91957092285156, + "objective/non_score_reward": -2.1459789276123047, + "objective/rlhf_reward": -7.18391523361206, + "objective/scores": 0.35, + "policy/approxkl_avg": 17.93328857421875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.796875, + "step": 535, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967997074127197 + }, + { + "episode": 8592, + "epoch": 0.05147931121256785, + "loss/policy_avg": 0.25293684005737305, + "lr": 9.657464212678938e-06, + "objective/entropy": 96.65731811523438, + "objective/kl": 40.929542541503906, + "objective/non_score_reward": -2.0464773178100586, + "objective/rlhf_reward": -3.7859089136123654, + "objective/scores": 1.1, + "policy/approxkl_avg": 86.96218872070312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.521484375, + "step": 536, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961689710617065 + }, + { + "episode": 8608, + "epoch": 0.0515751758516974, + "loss/policy_avg": 0.3026430606842041, + "lr": 9.656825153374235e-06, + "objective/entropy": 28.206890106201172, + "objective/kl": 32.628379821777344, + "objective/non_score_reward": -1.6314189434051514, + "objective/rlhf_reward": -6.525675892829895, + "objective/scores": 0.0, + "policy/approxkl_avg": 24.484819412231445, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.529296875, + "step": 537, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99712336063385 + }, + { + "episode": 8624, + "epoch": 0.05167104049082695, + "loss/policy_avg": -0.16653533279895782, + "lr": 9.65618609406953e-06, + "objective/entropy": -77.3916015625, + "objective/kl": 30.037582397460938, + "objective/non_score_reward": -1.5018792152404785, + "objective/rlhf_reward": -6.007516622543335, + "objective/scores": 0.0, + "policy/approxkl_avg": 87.15878295898438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6015625, + "step": 538, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.007481575012207 + }, + { + "episode": 8640, + "epoch": 0.0517669051299565, + "loss/policy_avg": 0.11232887953519821, + "lr": 9.655547034764827e-06, + "objective/entropy": 55.08220291137695, + "objective/kl": 43.097896575927734, + "objective/non_score_reward": -2.1548948287963867, + "objective/rlhf_reward": -8.6195787191391, + "objective/scores": 0.0, + "policy/approxkl_avg": 26.015697479248047, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.60546875, + "step": 539, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0278120040893555 + }, + { + "episode": 8656, + "epoch": 0.05186276976908605, + "loss/policy_avg": 0.6124523878097534, + "lr": 9.654907975460124e-06, + "objective/entropy": 34.89768600463867, + "objective/kl": 46.468101501464844, + "objective/non_score_reward": -2.3234052658081055, + "objective/rlhf_reward": -9.293620705604553, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.492743492126465, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3681640625, + "step": 540, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999890923500061 + }, + { + "episode": 8672, + "epoch": 0.0519586344082156, + "loss/policy_avg": 0.2736782133579254, + "lr": 9.65426891615542e-06, + "objective/entropy": 39.75823211669922, + "objective/kl": 30.90910530090332, + "objective/non_score_reward": -1.5454552173614502, + "objective/rlhf_reward": -4.666049086841282, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 31.34353256225586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4521484375, + "step": 541, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000300884246826 + }, + { + "episode": 8688, + "epoch": 0.052054499047345146, + "loss/policy_avg": 0.4218829870223999, + "lr": 9.653629856850718e-06, + "objective/entropy": 208.8717041015625, + "objective/kl": 39.65779495239258, + "objective/non_score_reward": -1.9828898906707764, + "objective/rlhf_reward": -6.5529573942102015, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 68.46629333496094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.662109375, + "step": 542, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981236457824707 + }, + { + "episode": 8704, + "epoch": 0.052150363686474695, + "loss/policy_avg": 0.45531219244003296, + "lr": 9.652990797546013e-06, + "objective/entropy": 209.2900390625, + "objective/kl": 33.24062728881836, + "objective/non_score_reward": -1.6620312929153442, + "objective/rlhf_reward": -5.248125171661377, + "objective/scores": 0.35, + "policy/approxkl_avg": 40.11018371582031, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.658203125, + "step": 543, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973430633544922 + }, + { + "episode": 8720, + "epoch": 0.052246228325604244, + "loss/policy_avg": 0.21141277253627777, + "lr": 9.65235173824131e-06, + "objective/entropy": 158.35287475585938, + "objective/kl": 32.80064392089844, + "objective/non_score_reward": -1.6400320529937744, + "objective/rlhf_reward": -8.560128211975098, + "objective/scores": -0.5, + "policy/approxkl_avg": 55.47556686401367, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 544, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9978859424591064 + }, + { + "episode": 8736, + "epoch": 0.05234209296473379, + "loss/policy_avg": 0.4950694143772125, + "lr": 9.651712678936605e-06, + "objective/entropy": 92.91190338134766, + "objective/kl": 28.654132843017578, + "objective/non_score_reward": -1.4327068328857422, + "objective/rlhf_reward": -3.905998165878366, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 13.473176956176758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.677734375, + "step": 545, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9976662397384644 + }, + { + "episode": 8752, + "epoch": 0.05243795760386334, + "loss/policy_avg": 0.1986934244632721, + "lr": 9.651073619631902e-06, + "objective/entropy": 72.54715728759766, + "objective/kl": 33.77407455444336, + "objective/non_score_reward": -1.6887036561965942, + "objective/rlhf_reward": -6.754814624786377, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.790899276733398, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.619140625, + "step": 546, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987945556640625 + }, + { + "episode": 8768, + "epoch": 0.0525338222429929, + "loss/policy_avg": 0.17602220177650452, + "lr": 9.650434560327199e-06, + "objective/entropy": 19.197650909423828, + "objective/kl": 42.189781188964844, + "objective/non_score_reward": -2.1094889640808105, + "objective/rlhf_reward": -5.514237318874571, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 83.59529113769531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.478515625, + "step": 547, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978179931640625 + }, + { + "episode": 8784, + "epoch": 0.052629686882122446, + "loss/policy_avg": 0.045998621731996536, + "lr": 9.649795501022496e-06, + "objective/entropy": 158.06063842773438, + "objective/kl": 40.811397552490234, + "objective/non_score_reward": -2.04056978225708, + "objective/rlhf_reward": -10.16227912902832, + "objective/scores": -0.5, + "policy/approxkl_avg": 111.83248901367188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4892578125, + "step": 548, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980087280273438 + }, + { + "episode": 8800, + "epoch": 0.052725551521251995, + "loss/policy_avg": 0.30770862102508545, + "lr": 9.649156441717792e-06, + "objective/entropy": 11.81429672241211, + "objective/kl": 30.653812408447266, + "objective/non_score_reward": -1.5326905250549316, + "objective/rlhf_reward": -4.52664235598238, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 12.983512878417969, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.4384765625, + "step": 549, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991545677185059 + }, + { + "episode": 8816, + "epoch": 0.052821416160381544, + "loss/policy_avg": 0.14274156093597412, + "lr": 9.64851738241309e-06, + "objective/entropy": 103.85163116455078, + "objective/kl": 43.19879913330078, + "objective/non_score_reward": -2.159940004348755, + "objective/rlhf_reward": -6.517053785101448, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 100.42656707763672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 550, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9965872764587402 + }, + { + "episode": 8832, + "epoch": 0.05291728079951109, + "loss/policy_avg": 0.06003594398498535, + "lr": 9.647878323108384e-06, + "objective/entropy": 179.23623657226562, + "objective/kl": 40.46935272216797, + "objective/non_score_reward": -2.023467779159546, + "objective/rlhf_reward": -6.752235582380919, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 20.83936882019043, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5625, + "step": 551, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001079797744751 + }, + { + "episode": 8848, + "epoch": 0.05301314543864064, + "loss/policy_avg": 0.5863113403320312, + "lr": 9.647239263803681e-06, + "objective/entropy": 76.55807495117188, + "objective/kl": 36.17455291748047, + "objective/non_score_reward": -1.8087276220321655, + "objective/rlhf_reward": -9.23491096496582, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.267086029052734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3798828125, + "step": 552, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001115322113037 + }, + { + "episode": 8864, + "epoch": 0.05310901007777019, + "loss/policy_avg": 2.5198092460632324, + "lr": 9.646600204498978e-06, + "objective/entropy": 31.032962799072266, + "objective/kl": 36.13847351074219, + "objective/non_score_reward": -1.8069238662719727, + "objective/rlhf_reward": -9.22769546508789, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.862698554992676, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.568359375, + "step": 553, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0627989768981934 + }, + { + "episode": 8880, + "epoch": 0.05320487471689974, + "loss/policy_avg": 0.135384663939476, + "lr": 9.645961145194275e-06, + "objective/entropy": 39.74359893798828, + "objective/kl": 45.49197006225586, + "objective/non_score_reward": -2.2745985984802246, + "objective/rlhf_reward": -11.098394393920898, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.076536178588867, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5703125, + "step": 554, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0400075912475586 + }, + { + "episode": 8896, + "epoch": 0.05330073935602929, + "loss/policy_avg": 0.17068403959274292, + "lr": 9.645322085889572e-06, + "objective/entropy": 131.28785705566406, + "objective/kl": 41.12070846557617, + "objective/non_score_reward": -2.056035280227661, + "objective/rlhf_reward": -6.913460275134444, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 19.08953094482422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.396484375, + "step": 555, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997930526733398 + }, + { + "episode": 8912, + "epoch": 0.05339660399515884, + "loss/policy_avg": 0.006861642003059387, + "lr": 9.644683026584867e-06, + "objective/entropy": 71.55320739746094, + "objective/kl": 31.577491760253906, + "objective/non_score_reward": -1.5788745880126953, + "objective/rlhf_reward": -4.75923916598852, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 11.279380798339844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.58984375, + "step": 556, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002732276916504 + }, + { + "episode": 8928, + "epoch": 0.05349246863428839, + "loss/policy_avg": 0.3932980000972748, + "lr": 9.644043967280164e-06, + "objective/entropy": 173.74667358398438, + "objective/kl": 39.91008758544922, + "objective/non_score_reward": -1.9955044984817505, + "objective/rlhf_reward": -6.157189126285623, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 75.61815643310547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5390625, + "step": 557, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991559982299805 + }, + { + "episode": 8944, + "epoch": 0.053588333273417936, + "loss/policy_avg": 0.41776636242866516, + "lr": 9.643404907975461e-06, + "objective/entropy": 101.93467712402344, + "objective/kl": 33.950069427490234, + "objective/non_score_reward": -1.6975035667419434, + "objective/rlhf_reward": -8.790014266967773, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.91536021232605, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7265625, + "step": 558, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994499683380127 + }, + { + "episode": 8960, + "epoch": 0.053684197912547485, + "loss/policy_avg": 3.235675811767578, + "lr": 9.642765848670758e-06, + "objective/entropy": 157.3035888671875, + "objective/kl": 48.83702087402344, + "objective/non_score_reward": -2.4418513774871826, + "objective/rlhf_reward": -11.76740550994873, + "objective/scores": -0.5, + "policy/approxkl_avg": 152.27467346191406, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.474609375, + "step": 559, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0070905685424805 + }, + { + "episode": 8976, + "epoch": 0.053780062551677034, + "loss/policy_avg": 0.4991706907749176, + "lr": 9.642126789366055e-06, + "objective/entropy": 163.01913452148438, + "objective/kl": 44.97026062011719, + "objective/non_score_reward": -2.2485132217407227, + "objective/rlhf_reward": -6.5940522909164425, + "objective/scores": 0.6, + "policy/approxkl_avg": 54.085365295410156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58203125, + "step": 560, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990899562835693 + }, + { + "episode": 8992, + "epoch": 0.05387592719080658, + "loss/policy_avg": 0.35669660568237305, + "lr": 9.641487730061352e-06, + "objective/entropy": 46.909828186035156, + "objective/kl": 31.477493286132812, + "objective/non_score_reward": -1.573874592781067, + "objective/rlhf_reward": -4.953862717657714, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 65.30628967285156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.533203125, + "step": 561, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979372024536133 + }, + { + "episode": 9008, + "epoch": 0.05397179182993613, + "loss/policy_avg": 1.0564548969268799, + "lr": 9.640848670756647e-06, + "objective/entropy": 144.66534423828125, + "objective/kl": 50.382301330566406, + "objective/non_score_reward": -2.5191149711608887, + "objective/rlhf_reward": -8.560688340457615, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 48.65314483642578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.748046875, + "step": 562, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982779026031494 + }, + { + "episode": 9024, + "epoch": 0.05406765646906568, + "loss/policy_avg": 0.11716368794441223, + "lr": 9.640209611451944e-06, + "objective/entropy": 98.48894500732422, + "objective/kl": 28.6131649017334, + "objective/non_score_reward": -1.4306584596633911, + "objective/rlhf_reward": -7.7226338386535645, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.523138999938965, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4130859375, + "step": 563, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0049519538879395 + }, + { + "episode": 9040, + "epoch": 0.05416352110819523, + "loss/policy_avg": 0.7250778675079346, + "lr": 9.63957055214724e-06, + "objective/entropy": 133.11463928222656, + "objective/kl": 37.375694274902344, + "objective/non_score_reward": -1.868784785270691, + "objective/rlhf_reward": -5.075139141082763, + "objective/scores": 0.6, + "policy/approxkl_avg": 13.915103912353516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41796875, + "step": 564, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986052513122559 + }, + { + "episode": 9056, + "epoch": 0.05425938574732478, + "loss/policy_avg": 0.7018356323242188, + "lr": 9.638931492842537e-06, + "objective/entropy": 89.61590576171875, + "objective/kl": 41.851219177246094, + "objective/non_score_reward": -2.0925612449645996, + "objective/rlhf_reward": -6.919646601291046, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 24.61191177368164, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.54296875, + "step": 565, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996438026428223 + }, + { + "episode": 9072, + "epoch": 0.05435525038645433, + "loss/policy_avg": 0.07808268815279007, + "lr": 9.638292433537834e-06, + "objective/entropy": 67.98313903808594, + "objective/kl": 45.94068145751953, + "objective/non_score_reward": -2.29703426361084, + "objective/rlhf_reward": -9.188136219978333, + "objective/scores": 0.0, + "policy/approxkl_avg": 38.387054443359375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.57421875, + "step": 566, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997014999389648 + }, + { + "episode": 9088, + "epoch": 0.05445111502558388, + "loss/policy_avg": 0.0561227947473526, + "lr": 9.63765337423313e-06, + "objective/entropy": -102.10490417480469, + "objective/kl": 32.92976379394531, + "objective/non_score_reward": -1.6464881896972656, + "objective/rlhf_reward": -6.585952877998352, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.602782249450684, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.54296875, + "step": 567, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004687309265137 + }, + { + "episode": 9104, + "epoch": 0.054546979664713426, + "loss/policy_avg": 0.5744443535804749, + "lr": 9.637014314928426e-06, + "objective/entropy": 78.12531280517578, + "objective/kl": 36.76774597167969, + "objective/non_score_reward": -1.8383872509002686, + "objective/rlhf_reward": -9.353549003601074, + "objective/scores": -0.5, + "policy/approxkl_avg": 55.96208572387695, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.51171875, + "step": 568, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991743564605713 + }, + { + "episode": 9120, + "epoch": 0.054642844303842975, + "loss/policy_avg": 0.564866304397583, + "lr": 9.636375255623721e-06, + "objective/entropy": 135.82896423339844, + "objective/kl": 43.04612731933594, + "objective/non_score_reward": -2.15230655670166, + "objective/rlhf_reward": -10.60922622680664, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.51314163208008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4541015625, + "step": 569, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000321388244629 + }, + { + "episode": 9136, + "epoch": 0.054738708942972523, + "loss/policy_avg": 0.16940301656723022, + "lr": 9.635736196319018e-06, + "objective/entropy": -114.63182830810547, + "objective/kl": 36.86830139160156, + "objective/non_score_reward": -1.8434150218963623, + "objective/rlhf_reward": -9.373659133911133, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.11627960205078, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.67578125, + "step": 570, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971542358398438 + }, + { + "episode": 9152, + "epoch": 0.05483457358210207, + "loss/policy_avg": 0.020508363842964172, + "lr": 9.635097137014315e-06, + "objective/entropy": 87.90492248535156, + "objective/kl": 46.06084442138672, + "objective/non_score_reward": -2.303041934967041, + "objective/rlhf_reward": -9.212168216705322, + "objective/scores": 0.0, + "policy/approxkl_avg": 56.395973205566406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.482421875, + "step": 571, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000852584838867 + }, + { + "episode": 9168, + "epoch": 0.05493043822123162, + "loss/policy_avg": -0.007636541500687599, + "lr": 9.634458077709612e-06, + "objective/entropy": -84.44242858886719, + "objective/kl": 36.771697998046875, + "objective/non_score_reward": -1.8385847806930542, + "objective/rlhf_reward": -5.995089256499691, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 0.3396506905555725, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.52734375, + "step": 572, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0062503814697266 + }, + { + "episode": 9184, + "epoch": 0.05502630286036117, + "loss/policy_avg": -0.1648610532283783, + "lr": 9.633819018404909e-06, + "objective/entropy": -70.38121795654297, + "objective/kl": 37.25817108154297, + "objective/non_score_reward": -1.8629084825515747, + "objective/rlhf_reward": -9.45163345336914, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.33013916015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.455078125, + "step": 573, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978995323181152 + }, + { + "episode": 9200, + "epoch": 0.05512216749949072, + "loss/policy_avg": 0.21912901103496552, + "lr": 9.633179959100206e-06, + "objective/entropy": 65.07223510742188, + "objective/kl": 37.938026428222656, + "objective/non_score_reward": -1.8969011306762695, + "objective/rlhf_reward": -5.854271189371744, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 179.24462890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 574, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971550703048706 + }, + { + "episode": 9216, + "epoch": 0.05521803213862027, + "loss/policy_avg": 0.42310407757759094, + "lr": 9.632540899795501e-06, + "objective/entropy": -74.82484436035156, + "objective/kl": 36.85780334472656, + "objective/non_score_reward": -1.8428901433944702, + "objective/rlhf_reward": -5.54673218277366, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 7.413464546203613, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.53515625, + "step": 575, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000021457672119 + }, + { + "episode": 9232, + "epoch": 0.05531389677774982, + "loss/policy_avg": 0.1522914469242096, + "lr": 9.631901840490798e-06, + "objective/entropy": -78.68695068359375, + "objective/kl": 36.93750762939453, + "objective/non_score_reward": -1.8468754291534424, + "objective/rlhf_reward": -7.387501657009125, + "objective/scores": 0.0, + "policy/approxkl_avg": 299.15435791015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 576, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0027832984924316 + }, + { + "episode": 9248, + "epoch": 0.055409761416879366, + "loss/policy_avg": 0.09173239022493362, + "lr": 9.631262781186095e-06, + "objective/entropy": 206.1100616455078, + "objective/kl": 36.94264221191406, + "objective/non_score_reward": -1.8471322059631348, + "objective/rlhf_reward": -6.009926774588925, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 51.173519134521484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 577, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9956302642822266 + }, + { + "episode": 9264, + "epoch": 0.055505626056008915, + "loss/policy_avg": 0.8109121322631836, + "lr": 9.630623721881392e-06, + "objective/entropy": -66.37505340576172, + "objective/kl": 42.11962127685547, + "objective/non_score_reward": -2.1059811115264893, + "objective/rlhf_reward": -7.023924565315246, + "objective/scores": 0.35, + "policy/approxkl_avg": 154.45883178710938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.568359375, + "step": 578, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999234676361084 + }, + { + "episode": 9280, + "epoch": 0.055601490695138464, + "loss/policy_avg": 0.20485125482082367, + "lr": 9.629984662576689e-06, + "objective/entropy": 172.38034057617188, + "objective/kl": 52.39830780029297, + "objective/non_score_reward": -2.61991548538208, + "objective/rlhf_reward": -9.079662299156189, + "objective/scores": 0.35, + "policy/approxkl_avg": 22.601041793823242, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.50390625, + "step": 579, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0032167434692383 + }, + { + "episode": 9296, + "epoch": 0.05569735533426801, + "loss/policy_avg": 0.7395042181015015, + "lr": 9.629345603271984e-06, + "objective/entropy": 7.457405090332031, + "objective/kl": 45.84647750854492, + "objective/non_score_reward": -2.2923238277435303, + "objective/rlhf_reward": -9.169295310974121, + "objective/scores": 0.0, + "policy/approxkl_avg": 124.10131072998047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 580, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992761611938477 + }, + { + "episode": 9312, + "epoch": 0.05579321997339756, + "loss/policy_avg": 0.23498813807964325, + "lr": 9.62870654396728e-06, + "objective/entropy": -11.64200210571289, + "objective/kl": 43.51708984375, + "objective/non_score_reward": -2.175854444503784, + "objective/rlhf_reward": -4.303417301177978, + "objective/scores": 1.1, + "policy/approxkl_avg": 15.8449068069458, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.587890625, + "step": 581, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007996559143066 + }, + { + "episode": 9328, + "epoch": 0.05588908461252711, + "loss/policy_avg": -0.26322293281555176, + "lr": 9.628067484662578e-06, + "objective/entropy": 14.886768341064453, + "objective/kl": 48.95528030395508, + "objective/non_score_reward": -2.4477639198303223, + "objective/rlhf_reward": -8.431805813048763, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 2.678018569946289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43359375, + "step": 582, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002763271331787 + }, + { + "episode": 9344, + "epoch": 0.05598494925165666, + "loss/policy_avg": 0.37311238050460815, + "lr": 9.627428425357874e-06, + "objective/entropy": -29.544679641723633, + "objective/kl": 44.6279411315918, + "objective/non_score_reward": -2.2313971519470215, + "objective/rlhf_reward": -7.474990586848602, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 120.46185302734375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4990234375, + "step": 583, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997020959854126 + }, + { + "episode": 9360, + "epoch": 0.05608081389078621, + "loss/policy_avg": 0.3765791654586792, + "lr": 9.626789366053171e-06, + "objective/entropy": 27.12002182006836, + "objective/kl": 39.4703369140625, + "objective/non_score_reward": -1.9735169410705566, + "objective/rlhf_reward": -9.894067764282227, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.805877208709717, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.529296875, + "step": 584, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993363618850708 + }, + { + "episode": 9376, + "epoch": 0.05617667852991576, + "loss/policy_avg": 0.03627479076385498, + "lr": 9.626150306748468e-06, + "objective/entropy": 15.946697235107422, + "objective/kl": 54.321632385253906, + "objective/non_score_reward": -2.7160816192626953, + "objective/rlhf_reward": -12.864326477050781, + "objective/scores": -0.5, + "policy/approxkl_avg": 170.80502319335938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5078125, + "step": 585, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002691745758057 + }, + { + "episode": 9392, + "epoch": 0.05627254316904531, + "loss/policy_avg": -0.02700839936733246, + "lr": 9.625511247443763e-06, + "objective/entropy": 86.60926055908203, + "objective/kl": 35.19524383544922, + "objective/non_score_reward": -1.7597622871398926, + "objective/rlhf_reward": -9.03904914855957, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.260379791259766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.755859375, + "step": 586, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0021920204162598 + }, + { + "episode": 9408, + "epoch": 0.056368407808174856, + "loss/policy_avg": 0.1763666421175003, + "lr": 9.62487218813906e-06, + "objective/entropy": 144.70706176757812, + "objective/kl": 23.635494232177734, + "objective/non_score_reward": -1.181774616241455, + "objective/rlhf_reward": -6.72709846496582, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.851166725158691, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6796875, + "step": 587, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997590780258179 + }, + { + "episode": 9424, + "epoch": 0.056464272447304405, + "loss/policy_avg": 0.527219295501709, + "lr": 9.624233128834357e-06, + "objective/entropy": 173.60789489746094, + "objective/kl": 42.092262268066406, + "objective/non_score_reward": -2.1046133041381836, + "objective/rlhf_reward": -8.418453335762024, + "objective/scores": 0.0, + "policy/approxkl_avg": 93.84263610839844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5078125, + "step": 588, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996020793914795 + }, + { + "episode": 9440, + "epoch": 0.056560137086433954, + "loss/policy_avg": 0.16997206211090088, + "lr": 9.623594069529654e-06, + "objective/entropy": 100.782470703125, + "objective/kl": 33.46315002441406, + "objective/non_score_reward": -1.6731575727462769, + "objective/rlhf_reward": -8.692630767822266, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.64217185974121, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 589, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9955521821975708 + }, + { + "episode": 9456, + "epoch": 0.0566560017255635, + "loss/policy_avg": 0.43916282057762146, + "lr": 9.62295501022495e-06, + "objective/entropy": -60.59128189086914, + "objective/kl": 42.55094909667969, + "objective/non_score_reward": -2.1275475025177, + "objective/rlhf_reward": -10.5101900100708, + "objective/scores": -0.5, + "policy/approxkl_avg": 162.38119506835938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6796875, + "step": 590, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992976188659668 + }, + { + "episode": 9472, + "epoch": 0.05675186636469305, + "loss/policy_avg": 0.412034809589386, + "lr": 9.622315950920246e-06, + "objective/entropy": 55.43243408203125, + "objective/kl": 33.46851348876953, + "objective/non_score_reward": -1.6734256744384766, + "objective/rlhf_reward": -8.693702697753906, + "objective/scores": -0.5, + "policy/approxkl_avg": 149.49728393554688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.595703125, + "step": 591, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977531433105469 + }, + { + "episode": 9488, + "epoch": 0.0568477310038226, + "loss/policy_avg": 0.28347572684288025, + "lr": 9.621676891615543e-06, + "objective/entropy": 35.55633544921875, + "objective/kl": 40.687564849853516, + "objective/non_score_reward": -2.0343782901763916, + "objective/rlhf_reward": -5.213794265629026, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 117.67996215820312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.591796875, + "step": 592, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995044469833374 + }, + { + "episode": 9504, + "epoch": 0.05694359564295215, + "loss/policy_avg": 0.7297570705413818, + "lr": 9.621037832310838e-06, + "objective/entropy": -104.14797973632812, + "objective/kl": 34.44893264770508, + "objective/non_score_reward": -1.7224466800689697, + "objective/rlhf_reward": -6.889786720275879, + "objective/scores": 0.0, + "policy/approxkl_avg": 22.624303817749023, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4912109375, + "step": 593, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997528314590454 + }, + { + "episode": 9520, + "epoch": 0.0570394602820817, + "loss/policy_avg": 0.0011496543884277344, + "lr": 9.620398773006135e-06, + "objective/entropy": 224.37185668945312, + "objective/kl": 27.430057525634766, + "objective/non_score_reward": -1.3715028762817383, + "objective/rlhf_reward": -3.5386005146073654, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 5.8247833251953125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.71875, + "step": 594, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000392436981201 + }, + { + "episode": 9536, + "epoch": 0.05713532492121125, + "loss/policy_avg": 0.23965345323085785, + "lr": 9.619759713701432e-06, + "objective/entropy": 25.137657165527344, + "objective/kl": 37.088069915771484, + "objective/non_score_reward": -1.8544034957885742, + "objective/rlhf_reward": -7.4176143407821655, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.948873519897461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44921875, + "step": 595, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978184700012207 + }, + { + "episode": 9552, + "epoch": 0.057231189560340796, + "loss/policy_avg": 0.5549752712249756, + "lr": 9.619120654396729e-06, + "objective/entropy": -2.4383678436279297, + "objective/kl": 42.59381103515625, + "objective/non_score_reward": -2.129690408706665, + "objective/rlhf_reward": -10.51876163482666, + "objective/scores": -0.5, + "policy/approxkl_avg": 311.0433349609375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6015625, + "step": 596, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999363899230957 + }, + { + "episode": 9568, + "epoch": 0.057327054199470345, + "loss/policy_avg": 0.4941880702972412, + "lr": 9.618481595092026e-06, + "objective/entropy": 94.06913757324219, + "objective/kl": 44.376983642578125, + "objective/non_score_reward": -2.2188491821289062, + "objective/rlhf_reward": -10.875396728515625, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.64936065673828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3056640625, + "step": 597, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996791958808899 + }, + { + "episode": 9584, + "epoch": 0.057422918838599894, + "loss/policy_avg": 0.269217848777771, + "lr": 9.617842535787323e-06, + "objective/entropy": 140.2623291015625, + "objective/kl": 48.70625305175781, + "objective/non_score_reward": -2.4353127479553223, + "objective/rlhf_reward": -9.74125051498413, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.9949951171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.435546875, + "step": 598, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980602264404297 + }, + { + "episode": 9600, + "epoch": 0.05751878347772944, + "loss/policy_avg": 8.679291725158691, + "lr": 9.617203476482618e-06, + "objective/entropy": 132.02699279785156, + "objective/kl": 40.58224868774414, + "objective/non_score_reward": -2.0291123390197754, + "objective/rlhf_reward": -10.116449356079102, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.197277069091797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71875, + "step": 599, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996961236000061 + }, + { + "episode": 9616, + "epoch": 0.05761464811685899, + "loss/policy_avg": 0.48320621252059937, + "lr": 9.616564417177915e-06, + "objective/entropy": -89.43022155761719, + "objective/kl": 48.188594818115234, + "objective/non_score_reward": -2.4094297885894775, + "objective/rlhf_reward": -11.63771915435791, + "objective/scores": -0.5, + "policy/approxkl_avg": 58.32026672363281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.51171875, + "step": 600, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011630058288574 + }, + { + "episode": 9632, + "epoch": 0.05771051275598854, + "loss/policy_avg": 0.28722313046455383, + "lr": 9.615925357873211e-06, + "objective/entropy": 168.44546508789062, + "objective/kl": 35.379539489746094, + "objective/non_score_reward": -1.7689769268035889, + "objective/rlhf_reward": -5.697305657950741, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 53.8581657409668, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 601, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994373321533203 + }, + { + "episode": 9648, + "epoch": 0.05780637739511809, + "loss/policy_avg": 4.71852445602417, + "lr": 9.615286298568508e-06, + "objective/entropy": 96.727294921875, + "objective/kl": 45.9981689453125, + "objective/non_score_reward": -2.2999086380004883, + "objective/rlhf_reward": -11.199634552001953, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.669649124145508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5390625, + "step": 602, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000046730041504 + }, + { + "episode": 9664, + "epoch": 0.05790224203424764, + "loss/policy_avg": -0.060234591364860535, + "lr": 9.614647239263805e-06, + "objective/entropy": 9.793865203857422, + "objective/kl": 26.792417526245117, + "objective/non_score_reward": -1.3396209478378296, + "objective/rlhf_reward": -3.9798818016923487, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 15.58694076538086, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.345703125, + "step": 603, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0112948417663574 + }, + { + "episode": 9680, + "epoch": 0.05799810667337719, + "loss/policy_avg": 0.44011878967285156, + "lr": 9.6140081799591e-06, + "objective/entropy": -27.098220825195312, + "objective/kl": 40.550758361816406, + "objective/non_score_reward": -2.0275378227233887, + "objective/rlhf_reward": -8.110151290893555, + "objective/scores": 0.0, + "policy/approxkl_avg": 112.79510498046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.611328125, + "step": 604, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973044395446777 + }, + { + "episode": 9696, + "epoch": 0.058093971312506744, + "loss/policy_avg": 0.1456628441810608, + "lr": 9.613369120654397e-06, + "objective/entropy": -26.12735366821289, + "objective/kl": 35.541786193847656, + "objective/non_score_reward": -1.777089238166809, + "objective/rlhf_reward": -7.108356952667236, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.542879104614258, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.75390625, + "step": 605, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982647895812988 + }, + { + "episode": 9712, + "epoch": 0.05818983595163629, + "loss/policy_avg": 0.504492461681366, + "lr": 9.612730061349694e-06, + "objective/entropy": 101.26957702636719, + "objective/kl": 54.091217041015625, + "objective/non_score_reward": -2.7045607566833496, + "objective/rlhf_reward": -12.818243026733398, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.624460220336914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73828125, + "step": 606, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990702867507935 + }, + { + "episode": 9728, + "epoch": 0.05828570059076584, + "loss/policy_avg": 0.21602584421634674, + "lr": 9.612091002044991e-06, + "objective/entropy": 148.65980529785156, + "objective/kl": 45.02153778076172, + "objective/non_score_reward": -2.2510766983032227, + "objective/rlhf_reward": -9.00430679321289, + "objective/scores": 0.0, + "policy/approxkl_avg": 113.05381774902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4365234375, + "step": 607, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995574951171875 + }, + { + "episode": 9744, + "epoch": 0.05838156522989539, + "loss/policy_avg": 0.29388415813446045, + "lr": 9.611451942740288e-06, + "objective/entropy": -23.157203674316406, + "objective/kl": 37.21900177001953, + "objective/non_score_reward": -1.860949993133545, + "objective/rlhf_reward": -9.44379997253418, + "objective/scores": -0.5, + "policy/approxkl_avg": 77.32159423828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5859375, + "step": 608, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001188278198242 + }, + { + "episode": 9760, + "epoch": 0.05847742986902494, + "loss/policy_avg": 0.11250358074903488, + "lr": 9.610812883435585e-06, + "objective/entropy": 140.81295776367188, + "objective/kl": 43.59325408935547, + "objective/non_score_reward": -2.1796627044677734, + "objective/rlhf_reward": -10.718650817871094, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.5329480171203613, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.447265625, + "step": 609, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009548664093018 + }, + { + "episode": 9776, + "epoch": 0.05857329450815449, + "loss/policy_avg": 0.10359665751457214, + "lr": 9.61017382413088e-06, + "objective/entropy": 14.696422576904297, + "objective/kl": 47.55508041381836, + "objective/non_score_reward": -2.3777542114257812, + "objective/rlhf_reward": -7.388310613409553, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.9345924854278564, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.86328125, + "step": 610, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000143527984619 + }, + { + "episode": 9792, + "epoch": 0.05866915914728404, + "loss/policy_avg": 1.0268946886062622, + "lr": 9.609534764826177e-06, + "objective/entropy": 26.149616241455078, + "objective/kl": 43.42169189453125, + "objective/non_score_reward": -2.1710846424102783, + "objective/rlhf_reward": -10.684338569641113, + "objective/scores": -0.5, + "policy/approxkl_avg": 151.4269561767578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.490234375, + "step": 611, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999128580093384 + }, + { + "episode": 9808, + "epoch": 0.05876502378641359, + "loss/policy_avg": -0.3294616937637329, + "lr": 9.608895705521472e-06, + "objective/entropy": 127.65898895263672, + "objective/kl": 51.51044845581055, + "objective/non_score_reward": -2.5755224227905273, + "objective/rlhf_reward": -12.30208969116211, + "objective/scores": -0.5, + "policy/approxkl_avg": 87.01832580566406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.58203125, + "step": 612, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.005786418914795 + }, + { + "episode": 9824, + "epoch": 0.058860888425543136, + "loss/policy_avg": 0.2266431748867035, + "lr": 9.608256646216769e-06, + "objective/entropy": -102.50221252441406, + "objective/kl": 53.019676208496094, + "objective/non_score_reward": -2.6509838104248047, + "objective/rlhf_reward": -9.153337101550445, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 187.98016357421875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4931640625, + "step": 613, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99696946144104 + }, + { + "episode": 9840, + "epoch": 0.058956753064672685, + "loss/policy_avg": 0.6474194526672363, + "lr": 9.607617586912066e-06, + "objective/entropy": 235.2186279296875, + "objective/kl": 45.518550872802734, + "objective/non_score_reward": -2.2759275436401367, + "objective/rlhf_reward": -7.278882022174905, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 28.18163299560547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640625, + "step": 614, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99918794631958 + }, + { + "episode": 9856, + "epoch": 0.05905261770380223, + "loss/policy_avg": 0.3383353352546692, + "lr": 9.606978527607363e-06, + "objective/entropy": 129.2579803466797, + "objective/kl": 42.21803283691406, + "objective/non_score_reward": -2.110901355743408, + "objective/rlhf_reward": -4.043605899810791, + "objective/scores": 1.1, + "policy/approxkl_avg": 5.800074577331543, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.75, + "step": 615, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993865489959717 + }, + { + "episode": 9872, + "epoch": 0.05914848234293178, + "loss/policy_avg": 0.5234297513961792, + "lr": 9.60633946830266e-06, + "objective/entropy": 128.2353973388672, + "objective/kl": 50.82809829711914, + "objective/non_score_reward": -2.541405200958252, + "objective/rlhf_reward": -12.165620803833008, + "objective/scores": -0.5, + "policy/approxkl_avg": 77.38518524169922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.41796875, + "step": 616, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000047206878662 + }, + { + "episode": 9888, + "epoch": 0.05924434698206133, + "loss/policy_avg": 0.5637346506118774, + "lr": 9.605700408997955e-06, + "objective/entropy": 32.960811614990234, + "objective/kl": 47.912315368652344, + "objective/non_score_reward": -2.395615816116333, + "objective/rlhf_reward": -8.158631165226069, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 85.89846801757812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4775390625, + "step": 617, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981329441070557 + }, + { + "episode": 9904, + "epoch": 0.05934021162119088, + "loss/policy_avg": 0.29263371229171753, + "lr": 9.605061349693252e-06, + "objective/entropy": 91.35547637939453, + "objective/kl": 29.78053092956543, + "objective/non_score_reward": -1.4890265464782715, + "objective/rlhf_reward": -4.008694956974919, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 66.5166244506836, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4091796875, + "step": 618, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9947412014007568 + }, + { + "episode": 9920, + "epoch": 0.05943607626032043, + "loss/policy_avg": 0.4225977063179016, + "lr": 9.604422290388548e-06, + "objective/entropy": -5.4639129638671875, + "objective/kl": 39.38916015625, + "objective/non_score_reward": -1.9694581031799316, + "objective/rlhf_reward": -6.321573107448176, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 24.45441246032715, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.466796875, + "step": 619, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975175857543945 + }, + { + "episode": 9936, + "epoch": 0.05953194089944998, + "loss/policy_avg": 0.4042114019393921, + "lr": 9.603783231083845e-06, + "objective/entropy": 7.100193023681641, + "objective/kl": 36.405548095703125, + "objective/non_score_reward": -1.820277214050293, + "objective/rlhf_reward": -7.281109094619751, + "objective/scores": 0.0, + "policy/approxkl_avg": 46.96625518798828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.779296875, + "step": 620, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974925518035889 + }, + { + "episode": 9952, + "epoch": 0.05962780553857953, + "loss/policy_avg": 0.15426188707351685, + "lr": 9.603144171779142e-06, + "objective/entropy": -8.429317474365234, + "objective/kl": 43.06403350830078, + "objective/non_score_reward": -2.1532018184661865, + "objective/rlhf_reward": -7.162208656878814, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 11.006355285644531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3974609375, + "step": 621, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989612102508545 + }, + { + "episode": 9968, + "epoch": 0.059723670177709076, + "loss/policy_avg": 0.3110625445842743, + "lr": 9.602505112474439e-06, + "objective/entropy": 94.2621078491211, + "objective/kl": 44.61369323730469, + "objective/non_score_reward": -2.230684757232666, + "objective/rlhf_reward": -5.999019299389097, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 6.9564127922058105, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 622, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0071253776550293 + }, + { + "episode": 9984, + "epoch": 0.059819534816838625, + "loss/policy_avg": 4.126076698303223, + "lr": 9.601866053169734e-06, + "objective/entropy": 120.39205932617188, + "objective/kl": 39.662933349609375, + "objective/non_score_reward": -1.9831466674804688, + "objective/rlhf_reward": -3.532586789131164, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.544063091278076, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62890625, + "step": 623, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0069477558135986 + }, + { + "episode": 10000, + "epoch": 0.059915399455968174, + "loss/policy_avg": 0.16749505698680878, + "lr": 9.601226993865031e-06, + "objective/entropy": 232.59893798828125, + "objective/kl": 57.862274169921875, + "objective/non_score_reward": -2.893113851547241, + "objective/rlhf_reward": -13.572455406188965, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.45920753479004, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.736328125, + "step": 624, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977095127105713 + }, + { + "episode": 10016, + "epoch": 0.06001126409509772, + "loss/policy_avg": 0.1318892389535904, + "lr": 9.600587934560328e-06, + "objective/entropy": 79.07177734375, + "objective/kl": 49.18446731567383, + "objective/non_score_reward": -2.4592232704162598, + "objective/rlhf_reward": -8.413061101635066, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 6.696374893188477, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4609375, + "step": 625, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9964910745620728 + }, + { + "episode": 10032, + "epoch": 0.06010712873422727, + "loss/policy_avg": 0.01355069875717163, + "lr": 9.599948875255625e-06, + "objective/entropy": 127.10783386230469, + "objective/kl": 41.830116271972656, + "objective/non_score_reward": -2.091505765914917, + "objective/rlhf_reward": -6.885070445950388, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 4.951809406280518, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.404296875, + "step": 626, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0029428005218506 + }, + { + "episode": 10048, + "epoch": 0.06020299337335682, + "loss/policy_avg": 0.050556108355522156, + "lr": 9.599309815950922e-06, + "objective/entropy": 201.34024047851562, + "objective/kl": 47.595428466796875, + "objective/non_score_reward": -2.3797712326049805, + "objective/rlhf_reward": -9.51908528804779, + "objective/scores": 0.0, + "policy/approxkl_avg": 108.17556762695312, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.583984375, + "step": 627, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.011460542678833 + }, + { + "episode": 10064, + "epoch": 0.06029885801248637, + "loss/policy_avg": -0.029277600347995758, + "lr": 9.598670756646217e-06, + "objective/entropy": -19.550357818603516, + "objective/kl": 42.81761932373047, + "objective/non_score_reward": -2.140881061553955, + "objective/rlhf_reward": -7.082571628506541, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 26.876096725463867, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.52734375, + "step": 628, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979857206344604 + }, + { + "episode": 10080, + "epoch": 0.06039472265161592, + "loss/policy_avg": 0.8738146424293518, + "lr": 9.598031697341514e-06, + "objective/entropy": 128.0510711669922, + "objective/kl": 39.39387130737305, + "objective/non_score_reward": -1.969693660736084, + "objective/rlhf_reward": -9.87877368927002, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.67377471923828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.501953125, + "step": 629, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9967011213302612 + }, + { + "episode": 10096, + "epoch": 0.06049058729074547, + "loss/policy_avg": 0.1045270562171936, + "lr": 9.59739263803681e-06, + "objective/entropy": 197.73403930664062, + "objective/kl": 49.12194061279297, + "objective/non_score_reward": -2.45609712600708, + "objective/rlhf_reward": -7.876977275090153, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 21.36578369140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59765625, + "step": 630, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.009080171585083 + }, + { + "episode": 10112, + "epoch": 0.06058645192987502, + "loss/policy_avg": 0.22648468613624573, + "lr": 9.596753578732108e-06, + "objective/entropy": -114.70201110839844, + "objective/kl": 30.594161987304688, + "objective/non_score_reward": -1.5297081470489502, + "objective/rlhf_reward": -6.11883282661438, + "objective/scores": 0.0, + "policy/approxkl_avg": 39.865116119384766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.603515625, + "step": 631, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99720299243927 + }, + { + "episode": 10128, + "epoch": 0.060682316569004566, + "loss/policy_avg": 0.38777509331703186, + "lr": 9.596114519427405e-06, + "objective/entropy": 67.81834411621094, + "objective/kl": 53.22407531738281, + "objective/non_score_reward": -2.6612038612365723, + "objective/rlhf_reward": -12.644815444946289, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.54733276367188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 632, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000068187713623 + }, + { + "episode": 10144, + "epoch": 0.060778181208134115, + "loss/policy_avg": 0.16470590233802795, + "lr": 9.595475460122701e-06, + "objective/entropy": 163.7096710205078, + "objective/kl": 42.381324768066406, + "objective/non_score_reward": -2.1190662384033203, + "objective/rlhf_reward": -7.134630015402465, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 49.23904800415039, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.515625, + "step": 633, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990718364715576 + }, + { + "episode": 10160, + "epoch": 0.060874045847263664, + "loss/policy_avg": 0.05951453745365143, + "lr": 9.594836400817997e-06, + "objective/entropy": 197.98974609375, + "objective/kl": 42.54900360107422, + "objective/non_score_reward": -2.1274502277374268, + "objective/rlhf_reward": -10.509800910949707, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.556913375854492, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.578125, + "step": 634, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001404047012329 + }, + { + "episode": 10176, + "epoch": 0.06096991048639321, + "loss/policy_avg": 0.06633798778057098, + "lr": 9.594197341513293e-06, + "objective/entropy": 19.799213409423828, + "objective/kl": 43.157569885253906, + "objective/non_score_reward": -2.1578786373138428, + "objective/rlhf_reward": -8.63151478767395, + "objective/scores": 0.0, + "policy/approxkl_avg": 144.175048828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.58984375, + "step": 635, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999399185180664 + }, + { + "episode": 10192, + "epoch": 0.06106577512552276, + "loss/policy_avg": 0.5394022464752197, + "lr": 9.593558282208589e-06, + "objective/entropy": -21.35447120666504, + "objective/kl": 43.028507232666016, + "objective/non_score_reward": -2.151425361633301, + "objective/rlhf_reward": -10.605701446533203, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.328895568847656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.595703125, + "step": 636, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997023105621338 + }, + { + "episode": 10208, + "epoch": 0.06116163976465231, + "loss/policy_avg": 0.3087373971939087, + "lr": 9.592919222903886e-06, + "objective/entropy": 143.51153564453125, + "objective/kl": 44.73971939086914, + "objective/non_score_reward": -2.236985683441162, + "objective/rlhf_reward": -10.947942733764648, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.09835433959961, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 637, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996269941329956 + }, + { + "episode": 10224, + "epoch": 0.06125750440378186, + "loss/policy_avg": 0.08359929919242859, + "lr": 9.592280163599182e-06, + "objective/entropy": 83.88134765625, + "objective/kl": 27.66678237915039, + "objective/non_score_reward": -1.3833391666412354, + "objective/rlhf_reward": -5.533356785774231, + "objective/scores": 0.0, + "policy/approxkl_avg": 118.91204833984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 638, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976441860198975 + }, + { + "episode": 10240, + "epoch": 0.06135336904291141, + "loss/policy_avg": 0.33437931537628174, + "lr": 9.59164110429448e-06, + "objective/entropy": 81.66073608398438, + "objective/kl": 42.31568908691406, + "objective/non_score_reward": -2.1157844066619873, + "objective/rlhf_reward": -7.039305646617976, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 12.253518104553223, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.2958984375, + "step": 639, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980757236480713 + }, + { + "episode": 10256, + "epoch": 0.06144923368204096, + "loss/policy_avg": 0.8385659456253052, + "lr": 9.591002044989776e-06, + "objective/entropy": 17.857593536376953, + "objective/kl": 34.342498779296875, + "objective/non_score_reward": -1.7171249389648438, + "objective/rlhf_reward": -8.868499755859375, + "objective/scores": -0.5, + "policy/approxkl_avg": 148.35433959960938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.541015625, + "step": 640, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003573894500732 + }, + { + "episode": 10272, + "epoch": 0.061545098321170506, + "loss/policy_avg": 0.10360659658908844, + "lr": 9.590362985685071e-06, + "objective/entropy": 182.46246337890625, + "objective/kl": 49.80595397949219, + "objective/non_score_reward": -2.490297794342041, + "objective/rlhf_reward": -11.961191177368164, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.64073944091797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.796875, + "step": 641, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005903244018555 + }, + { + "episode": 10288, + "epoch": 0.061640962960300055, + "loss/policy_avg": 0.605118989944458, + "lr": 9.589723926380368e-06, + "objective/entropy": 131.2631378173828, + "objective/kl": 50.88385772705078, + "objective/non_score_reward": -2.5441927909851074, + "objective/rlhf_reward": -8.817521536086483, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 16.61669921875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4345703125, + "step": 642, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990119934082031 + }, + { + "episode": 10304, + "epoch": 0.061736827599429604, + "loss/policy_avg": 0.24426089227199554, + "lr": 9.589084867075665e-06, + "objective/entropy": 80.13182067871094, + "objective/kl": 54.13704299926758, + "objective/non_score_reward": -2.7068519592285156, + "objective/rlhf_reward": -12.827407836914062, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.57849884033203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4326171875, + "step": 643, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961745738983154 + }, + { + "episode": 10320, + "epoch": 0.06183269223855915, + "loss/policy_avg": 0.12039253860712051, + "lr": 9.588445807770962e-06, + "objective/entropy": 219.00091552734375, + "objective/kl": 27.824764251708984, + "objective/non_score_reward": -1.3912383317947388, + "objective/rlhf_reward": -4.049181544574436, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 191.24160766601562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8515625, + "step": 644, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0041441917419434 + }, + { + "episode": 10336, + "epoch": 0.0619285568776887, + "loss/policy_avg": 0.7645555734634399, + "lr": 9.587806748466259e-06, + "objective/entropy": 186.13551330566406, + "objective/kl": 60.495948791503906, + "objective/non_score_reward": -3.0247974395751953, + "objective/rlhf_reward": -14.099189758300781, + "objective/scores": -0.5, + "policy/approxkl_avg": 286.13507080078125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.63671875, + "step": 645, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.994465708732605 + }, + { + "episode": 10352, + "epoch": 0.06202442151681825, + "loss/policy_avg": 0.5267387628555298, + "lr": 9.587167689161556e-06, + "objective/entropy": 120.89801788330078, + "objective/kl": 46.46590805053711, + "objective/non_score_reward": -2.3232955932617188, + "objective/rlhf_reward": -11.293182373046875, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.699556350708008, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.587890625, + "step": 646, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987261295318604 + }, + { + "episode": 10368, + "epoch": 0.0621202861559478, + "loss/policy_avg": 1.7349122762680054, + "lr": 9.586528629856851e-06, + "objective/entropy": 137.45834350585938, + "objective/kl": 54.72475051879883, + "objective/non_score_reward": -2.7362375259399414, + "objective/rlhf_reward": -10.944950342178345, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.560152053833008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.501953125, + "step": 647, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0040972232818604 + }, + { + "episode": 10384, + "epoch": 0.06221615079507735, + "loss/policy_avg": 1.7710604667663574, + "lr": 9.585889570552148e-06, + "objective/entropy": 69.20938110351562, + "objective/kl": 28.258338928222656, + "objective/non_score_reward": -1.4129170179367065, + "objective/rlhf_reward": -3.251668310165405, + "objective/scores": 0.6, + "policy/approxkl_avg": 12.532756805419922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.57421875, + "step": 648, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99672532081604 + }, + { + "episode": 10400, + "epoch": 0.0623120154342069, + "loss/policy_avg": -0.4587554633617401, + "lr": 9.585250511247445e-06, + "objective/entropy": 118.23152160644531, + "objective/kl": 41.172386169433594, + "objective/non_score_reward": -2.058619260787964, + "objective/rlhf_reward": -8.234476923942566, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.415938377380371, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.619140625, + "step": 649, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0050253868103027 + }, + { + "episode": 10416, + "epoch": 0.06240788007333645, + "loss/policy_avg": 0.05152188241481781, + "lr": 9.584611451942742e-06, + "objective/entropy": 111.10828399658203, + "objective/kl": 60.79473876953125, + "objective/non_score_reward": -3.0397372245788574, + "objective/rlhf_reward": -12.158948183059692, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.2168221473693848, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71875, + "step": 650, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011370182037354 + }, + { + "episode": 10432, + "epoch": 0.062503744712466, + "loss/policy_avg": 0.1416802853345871, + "lr": 9.583972392638038e-06, + "objective/entropy": 53.75664520263672, + "objective/kl": 48.958587646484375, + "objective/non_score_reward": -2.4479293823242188, + "objective/rlhf_reward": -11.791717529296875, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.57497024536133, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.68359375, + "step": 651, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000466823577881 + }, + { + "episode": 10448, + "epoch": 0.06259960935159554, + "loss/policy_avg": 0.2908247709274292, + "lr": 9.583333333333335e-06, + "objective/entropy": -30.46204376220703, + "objective/kl": 38.81657791137695, + "objective/non_score_reward": -1.940828800201416, + "objective/rlhf_reward": -7.763315439224243, + "objective/scores": 0.0, + "policy/approxkl_avg": 25.528799057006836, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 652, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997721791267395 + }, + { + "episode": 10464, + "epoch": 0.0626954739907251, + "loss/policy_avg": 0.5172910094261169, + "lr": 9.58269427402863e-06, + "objective/entropy": 86.23279571533203, + "objective/kl": 44.31052017211914, + "objective/non_score_reward": -2.2155261039733887, + "objective/rlhf_reward": -10.862104415893555, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.40749740600586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 653, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0023417472839355 + }, + { + "episode": 10480, + "epoch": 0.06279133862985464, + "loss/policy_avg": 0.39073634147644043, + "lr": 9.582055214723927e-06, + "objective/entropy": 100.01063537597656, + "objective/kl": 47.22362518310547, + "objective/non_score_reward": -2.3611814975738525, + "objective/rlhf_reward": -11.44472599029541, + "objective/scores": -0.5, + "policy/approxkl_avg": 60.52268981933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.2978515625, + "step": 654, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997495412826538 + }, + { + "episode": 10496, + "epoch": 0.0628872032689842, + "loss/policy_avg": 1.133047342300415, + "lr": 9.581416155419224e-06, + "objective/entropy": 159.82931518554688, + "objective/kl": 40.75773620605469, + "objective/non_score_reward": -2.03788685798645, + "objective/rlhf_reward": -10.151546478271484, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.9608802795410156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4453125, + "step": 655, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015578269958496 + }, + { + "episode": 10512, + "epoch": 0.06298306790811374, + "loss/policy_avg": 0.22243613004684448, + "lr": 9.58077709611452e-06, + "objective/entropy": 223.8974609375, + "objective/kl": 48.50587463378906, + "objective/non_score_reward": -2.4252936840057373, + "objective/rlhf_reward": -9.701175212860107, + "objective/scores": 0.0, + "policy/approxkl_avg": 93.94710540771484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 656, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992177486419678 + }, + { + "episode": 10528, + "epoch": 0.0630789325472433, + "loss/policy_avg": 0.3326851725578308, + "lr": 9.580138036809816e-06, + "objective/entropy": 55.81219482421875, + "objective/kl": 50.48834228515625, + "objective/non_score_reward": -2.5244171619415283, + "objective/rlhf_reward": -10.097668766975403, + "objective/scores": 0.0, + "policy/approxkl_avg": 73.96626281738281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.728515625, + "step": 657, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999157190322876 + }, + { + "episode": 10544, + "epoch": 0.06317479718637284, + "loss/policy_avg": 0.05004708841443062, + "lr": 9.579498977505113e-06, + "objective/entropy": -11.130489349365234, + "objective/kl": 46.011192321777344, + "objective/non_score_reward": -2.3005595207214355, + "objective/rlhf_reward": -7.842988455031795, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 15.272626876831055, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4716796875, + "step": 658, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0058817863464355 + }, + { + "episode": 10560, + "epoch": 0.0632706618255024, + "loss/policy_avg": 0.08118234574794769, + "lr": 9.57885991820041e-06, + "objective/entropy": 146.0076904296875, + "objective/kl": 45.32005310058594, + "objective/non_score_reward": -2.266002655029297, + "objective/rlhf_reward": -11.064010620117188, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.8377790451049805, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6015625, + "step": 659, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000117778778076 + }, + { + "episode": 10576, + "epoch": 0.06336652646463194, + "loss/policy_avg": 1.6007329225540161, + "lr": 9.578220858895705e-06, + "objective/entropy": 198.6329345703125, + "objective/kl": 42.26470947265625, + "objective/non_score_reward": -2.1132352352142334, + "objective/rlhf_reward": -6.505530188755925, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 30.21502685546875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.595703125, + "step": 660, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00093150138855 + }, + { + "episode": 10592, + "epoch": 0.06346239110376149, + "loss/policy_avg": 1.8413606882095337, + "lr": 9.577581799591002e-06, + "objective/entropy": -39.25248718261719, + "objective/kl": 46.20891571044922, + "objective/non_score_reward": -2.31044602394104, + "objective/rlhf_reward": -11.24178409576416, + "objective/scores": -0.5, + "policy/approxkl_avg": 25.374454498291016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.498046875, + "step": 661, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000864028930664 + }, + { + "episode": 10608, + "epoch": 0.06355825574289103, + "loss/policy_avg": 0.3250480890274048, + "lr": 9.576942740286299e-06, + "objective/entropy": -94.04759216308594, + "objective/kl": 49.534393310546875, + "objective/non_score_reward": -2.476719617843628, + "objective/rlhf_reward": -9.906878232955933, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.400192260742188, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.501953125, + "step": 662, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0018579959869385 + }, + { + "episode": 10624, + "epoch": 0.06365412038202059, + "loss/policy_avg": 0.25218451023101807, + "lr": 9.576303680981596e-06, + "objective/entropy": -9.278493881225586, + "objective/kl": 31.44525146484375, + "objective/non_score_reward": -1.5722625255584717, + "objective/rlhf_reward": -4.732790796962336, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 11.464608192443848, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.46875, + "step": 663, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993681907653809 + }, + { + "episode": 10640, + "epoch": 0.06374998502115013, + "loss/policy_avg": 0.17636987566947937, + "lr": 9.575664621676893e-06, + "objective/entropy": 200.01724243164062, + "objective/kl": 42.763118743896484, + "objective/non_score_reward": -2.138155937194824, + "objective/rlhf_reward": -10.552623748779297, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.643232822418213, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68359375, + "step": 664, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.008737802505493 + }, + { + "episode": 10656, + "epoch": 0.06384584966027969, + "loss/policy_avg": -0.019399195909500122, + "lr": 9.57502556237219e-06, + "objective/entropy": 132.64715576171875, + "objective/kl": 49.362979888916016, + "objective/non_score_reward": -2.468149185180664, + "objective/rlhf_reward": -11.872596740722656, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.51764678955078, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4716796875, + "step": 665, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008597373962402 + }, + { + "episode": 10672, + "epoch": 0.06394171429940923, + "loss/policy_avg": 1.0311392545700073, + "lr": 9.574386503067485e-06, + "objective/entropy": 46.73440170288086, + "objective/kl": 34.47108459472656, + "objective/non_score_reward": -1.7235543727874756, + "objective/rlhf_reward": -6.894217133522034, + "objective/scores": 0.0, + "policy/approxkl_avg": 73.97438049316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.650390625, + "step": 666, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001150608062744 + }, + { + "episode": 10688, + "epoch": 0.06403757893853879, + "loss/policy_avg": 0.47999733686447144, + "lr": 9.573747443762782e-06, + "objective/entropy": -65.0555419921875, + "objective/kl": 27.064815521240234, + "objective/non_score_reward": -1.353240966796875, + "objective/rlhf_reward": -5.412964165210724, + "objective/scores": 0.0, + "policy/approxkl_avg": 32.14912796020508, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66015625, + "step": 667, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980639219284058 + }, + { + "episode": 10704, + "epoch": 0.06413344357766833, + "loss/policy_avg": 0.1448586881160736, + "lr": 9.573108384458079e-06, + "objective/entropy": 5.8227996826171875, + "objective/kl": 44.61048126220703, + "objective/non_score_reward": -2.2305238246917725, + "objective/rlhf_reward": -10.92209529876709, + "objective/scores": -0.5, + "policy/approxkl_avg": 188.19000244140625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640625, + "step": 668, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998194694519043 + }, + { + "episode": 10720, + "epoch": 0.06422930821679788, + "loss/policy_avg": 0.22846150398254395, + "lr": 9.572469325153375e-06, + "objective/entropy": -141.0145263671875, + "objective/kl": 34.947044372558594, + "objective/non_score_reward": -1.747352123260498, + "objective/rlhf_reward": -8.989408493041992, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.358154296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 669, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974150657653809 + }, + { + "episode": 10736, + "epoch": 0.06432517285592743, + "loss/policy_avg": 0.6482763290405273, + "lr": 9.571830265848672e-06, + "objective/entropy": 133.55731201171875, + "objective/kl": 43.53638458251953, + "objective/non_score_reward": -2.176819324493408, + "objective/rlhf_reward": -4.307277536392212, + "objective/scores": 1.1, + "policy/approxkl_avg": 86.96247863769531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 670, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970569610595703 + }, + { + "episode": 10752, + "epoch": 0.06442103749505698, + "loss/policy_avg": 0.31093263626098633, + "lr": 9.571191206543968e-06, + "objective/entropy": 59.441810607910156, + "objective/kl": 46.33723449707031, + "objective/non_score_reward": -2.316861629486084, + "objective/rlhf_reward": -7.888844587890011, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 26.443016052246094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.638671875, + "step": 671, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999953031539917 + }, + { + "episode": 10768, + "epoch": 0.06451690213418652, + "loss/policy_avg": 1.3164701461791992, + "lr": 9.570552147239264e-06, + "objective/entropy": 171.2152099609375, + "objective/kl": 29.308670043945312, + "objective/non_score_reward": -1.4654334783554077, + "objective/rlhf_reward": -4.502484047149105, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 79.94940948486328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.638671875, + "step": 672, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979584217071533 + }, + { + "episode": 10784, + "epoch": 0.06461276677331608, + "loss/policy_avg": 0.07863028347492218, + "lr": 9.569913087934561e-06, + "objective/entropy": 164.44989013671875, + "objective/kl": 39.448951721191406, + "objective/non_score_reward": -1.972447395324707, + "objective/rlhf_reward": -7.889789819717407, + "objective/scores": 0.0, + "policy/approxkl_avg": 50.970458984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 673, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985225200653076 + }, + { + "episode": 10800, + "epoch": 0.06470863141244562, + "loss/policy_avg": 0.10430300980806351, + "lr": 9.569274028629858e-06, + "objective/entropy": 84.69925689697266, + "objective/kl": 36.72293472290039, + "objective/non_score_reward": -1.8361468315124512, + "objective/rlhf_reward": -7.344587206840515, + "objective/scores": 0.0, + "policy/approxkl_avg": 122.58232116699219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.625, + "step": 674, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983776807785034 + }, + { + "episode": 10816, + "epoch": 0.06480449605157518, + "loss/policy_avg": 0.4454188644886017, + "lr": 9.568634969325155e-06, + "objective/entropy": 64.41371154785156, + "objective/kl": 49.93177795410156, + "objective/non_score_reward": -2.496588706970215, + "objective/rlhf_reward": -11.98635482788086, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.835880279541016, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.591796875, + "step": 675, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982514381408691 + }, + { + "episode": 10832, + "epoch": 0.06490036069070472, + "loss/policy_avg": 0.5164119005203247, + "lr": 9.567995910020452e-06, + "objective/entropy": 67.62659454345703, + "objective/kl": 37.88905715942383, + "objective/non_score_reward": -1.8944528102874756, + "objective/rlhf_reward": -9.577811241149902, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.208919525146484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.66796875, + "step": 676, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9954957962036133 + }, + { + "episode": 10848, + "epoch": 0.06499622532983428, + "loss/policy_avg": 0.35832202434539795, + "lr": 9.567356850715747e-06, + "objective/entropy": -25.133766174316406, + "objective/kl": 46.228153228759766, + "objective/non_score_reward": -2.3114078044891357, + "objective/rlhf_reward": -11.245631217956543, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.48291778564453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55078125, + "step": 677, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990386962890625 + }, + { + "episode": 10864, + "epoch": 0.06509208996896382, + "loss/policy_avg": 0.2558044195175171, + "lr": 9.566717791411044e-06, + "objective/entropy": -49.5452995300293, + "objective/kl": 25.26848602294922, + "objective/non_score_reward": -1.2634243965148926, + "objective/rlhf_reward": -7.05369758605957, + "objective/scores": -0.5, + "policy/approxkl_avg": 137.56399536132812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.712890625, + "step": 678, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979135990142822 + }, + { + "episode": 10880, + "epoch": 0.06518795460809337, + "loss/policy_avg": 0.33528435230255127, + "lr": 9.56607873210634e-06, + "objective/entropy": 82.0921630859375, + "objective/kl": 45.25497817993164, + "objective/non_score_reward": -2.262749195098877, + "objective/rlhf_reward": -9.05099630355835, + "objective/scores": 0.0, + "policy/approxkl_avg": 141.86026000976562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.51171875, + "step": 679, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980320930480957 + }, + { + "episode": 10896, + "epoch": 0.06528381924722292, + "loss/policy_avg": 0.26595377922058105, + "lr": 9.565439672801636e-06, + "objective/entropy": -16.28606414794922, + "objective/kl": 39.699378967285156, + "objective/non_score_reward": -1.984969139099121, + "objective/rlhf_reward": -9.939876556396484, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.83110237121582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.365234375, + "step": 680, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976716041564941 + }, + { + "episode": 10912, + "epoch": 0.06537968388635247, + "loss/policy_avg": -0.07005812227725983, + "lr": 9.564800613496933e-06, + "objective/entropy": 196.90921020507812, + "objective/kl": 35.69910430908203, + "objective/non_score_reward": -1.7849552631378174, + "objective/rlhf_reward": -7.1398210525512695, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.183989524841309, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6015625, + "step": 681, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000659942626953 + }, + { + "episode": 10928, + "epoch": 0.06547554852548201, + "loss/policy_avg": 0.34592947363853455, + "lr": 9.56416155419223e-06, + "objective/entropy": 93.98797607421875, + "objective/kl": 45.66193771362305, + "objective/non_score_reward": -2.2830967903137207, + "objective/rlhf_reward": -7.009680690542732, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 25.68501091003418, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 682, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988491535186768 + }, + { + "episode": 10944, + "epoch": 0.06557141316461157, + "loss/policy_avg": 0.3599075675010681, + "lr": 9.563522494887527e-06, + "objective/entropy": 115.81468200683594, + "objective/kl": 45.09308624267578, + "objective/non_score_reward": -2.2546539306640625, + "objective/rlhf_reward": -11.01861572265625, + "objective/scores": -0.5, + "policy/approxkl_avg": 92.67312622070312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4169921875, + "step": 683, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991455078125 + }, + { + "episode": 10960, + "epoch": 0.06566727780374111, + "loss/policy_avg": 0.701302170753479, + "lr": 9.562883435582822e-06, + "objective/entropy": 143.72076416015625, + "objective/kl": 54.24695587158203, + "objective/non_score_reward": -2.712347984313965, + "objective/rlhf_reward": -10.84939169883728, + "objective/scores": 0.0, + "policy/approxkl_avg": 46.84626007080078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.46875, + "step": 684, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998202919960022 + }, + { + "episode": 10976, + "epoch": 0.06576314244287067, + "loss/policy_avg": 0.5290093421936035, + "lr": 9.562244376278119e-06, + "objective/entropy": 43.091392517089844, + "objective/kl": 35.95604705810547, + "objective/non_score_reward": -1.7978023290634155, + "objective/rlhf_reward": -7.191209495067596, + "objective/scores": 0.0, + "policy/approxkl_avg": 11.957923889160156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.60546875, + "step": 685, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012266635894775 + }, + { + "episode": 10992, + "epoch": 0.06585900708200021, + "loss/policy_avg": 0.10781164467334747, + "lr": 9.561605316973416e-06, + "objective/entropy": -93.17870330810547, + "objective/kl": 40.609893798828125, + "objective/non_score_reward": -2.0304946899414062, + "objective/rlhf_reward": -10.121978759765625, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.1215343475341797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.578125, + "step": 686, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0011990070343018 + }, + { + "episode": 11008, + "epoch": 0.06595487172112977, + "loss/policy_avg": 1.0219621658325195, + "lr": 9.560966257668713e-06, + "objective/entropy": -3.830078125, + "objective/kl": 50.50682067871094, + "objective/non_score_reward": -2.5253407955169678, + "objective/rlhf_reward": -10.10136342048645, + "objective/scores": 0.0, + "policy/approxkl_avg": 47.36224365234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60546875, + "step": 687, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977154731750488 + }, + { + "episode": 11024, + "epoch": 0.06605073636025931, + "loss/policy_avg": 0.062187694013118744, + "lr": 9.56032719836401e-06, + "objective/entropy": 54.81780242919922, + "objective/kl": 54.68252182006836, + "objective/non_score_reward": -2.734126091003418, + "objective/rlhf_reward": -10.936504244804382, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.563399314880371, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3525390625, + "step": 688, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000232696533203 + }, + { + "episode": 11040, + "epoch": 0.06614660099938886, + "loss/policy_avg": 0.7583179473876953, + "lr": 9.559688139059306e-06, + "objective/entropy": 27.036727905273438, + "objective/kl": 58.335323333740234, + "objective/non_score_reward": -2.91676664352417, + "objective/rlhf_reward": -13.66706657409668, + "objective/scores": -0.5, + "policy/approxkl_avg": 115.37298583984375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.630859375, + "step": 689, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.994452953338623 + }, + { + "episode": 11056, + "epoch": 0.0662424656385184, + "loss/policy_avg": 0.4621831774711609, + "lr": 9.559049079754601e-06, + "objective/entropy": 100.6329345703125, + "objective/kl": 37.904075622558594, + "objective/non_score_reward": -1.8952038288116455, + "objective/rlhf_reward": -7.5808151960372925, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.872805118560791, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46484375, + "step": 690, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998521327972412 + }, + { + "episode": 11072, + "epoch": 0.06633833027764796, + "loss/policy_avg": 0.5286589860916138, + "lr": 9.558410020449898e-06, + "objective/entropy": 28.099346160888672, + "objective/kl": 43.15637969970703, + "objective/non_score_reward": -2.1578190326690674, + "objective/rlhf_reward": -8.631276488304138, + "objective/scores": 0.0, + "policy/approxkl_avg": 24.823148727416992, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 691, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005221366882324 + }, + { + "episode": 11088, + "epoch": 0.06643419491677752, + "loss/policy_avg": -0.19969536364078522, + "lr": 9.557770961145195e-06, + "objective/entropy": 118.29563903808594, + "objective/kl": 53.327049255371094, + "objective/non_score_reward": -2.6663525104522705, + "objective/rlhf_reward": -10.665409803390503, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6854915618896484, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5, + "step": 692, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002560615539551 + }, + { + "episode": 11104, + "epoch": 0.06653005955590706, + "loss/policy_avg": -0.4247640371322632, + "lr": 9.557131901840492e-06, + "objective/entropy": -71.97764587402344, + "objective/kl": 26.226512908935547, + "objective/non_score_reward": -1.3113257884979248, + "objective/rlhf_reward": -7.245303153991699, + "objective/scores": -0.5, + "policy/approxkl_avg": 123.49911499023438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3984375, + "step": 693, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.034991979598999 + }, + { + "episode": 11120, + "epoch": 0.06662592419503661, + "loss/policy_avg": 1.5854076147079468, + "lr": 9.556492842535789e-06, + "objective/entropy": -111.44104766845703, + "objective/kl": 33.5144157409668, + "objective/non_score_reward": -1.6757209300994873, + "objective/rlhf_reward": -6.702884018421173, + "objective/scores": 0.0, + "policy/approxkl_avg": 30.64876937866211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4697265625, + "step": 694, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0084142684936523 + }, + { + "episode": 11136, + "epoch": 0.06672178883416616, + "loss/policy_avg": 0.8775092959403992, + "lr": 9.555853783231084e-06, + "objective/entropy": 85.61629486083984, + "objective/kl": 51.17097091674805, + "objective/non_score_reward": -2.55854868888855, + "objective/rlhf_reward": -12.2341947555542, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.5744009017944336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.755859375, + "step": 695, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000871419906616 + }, + { + "episode": 11152, + "epoch": 0.06681765347329571, + "loss/policy_avg": 0.7041283845901489, + "lr": 9.555214723926381e-06, + "objective/entropy": 94.05027770996094, + "objective/kl": 56.83818817138672, + "objective/non_score_reward": -2.841909408569336, + "objective/rlhf_reward": -9.886685254986643, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 319.5308837890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4013671875, + "step": 696, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9966342449188232 + }, + { + "episode": 11168, + "epoch": 0.06691351811242525, + "loss/policy_avg": 0.2705743610858917, + "lr": 9.554575664621678e-06, + "objective/entropy": 115.7253189086914, + "objective/kl": 46.57780456542969, + "objective/non_score_reward": -2.328890323638916, + "objective/rlhf_reward": -9.315561175346375, + "objective/scores": 0.0, + "policy/approxkl_avg": 53.3902587890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5703125, + "step": 697, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985805749893188 + }, + { + "episode": 11184, + "epoch": 0.06700938275155481, + "loss/policy_avg": -0.02218908816576004, + "lr": 9.553936605316975e-06, + "objective/entropy": -139.7073516845703, + "objective/kl": 38.22663879394531, + "objective/non_score_reward": -1.9113318920135498, + "objective/rlhf_reward": -9.6453275680542, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.588433265686035, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.677734375, + "step": 698, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0038301944732666 + }, + { + "episode": 11200, + "epoch": 0.06710524739068435, + "loss/policy_avg": 0.18402889370918274, + "lr": 9.553297546012272e-06, + "objective/entropy": -16.68351936340332, + "objective/kl": 58.843841552734375, + "objective/non_score_reward": -2.9421918392181396, + "objective/rlhf_reward": -10.287814977581858, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 80.47232055664062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.607421875, + "step": 699, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982250928878784 + }, + { + "episode": 11216, + "epoch": 0.06720111202981391, + "loss/policy_avg": 0.25904542207717896, + "lr": 9.552658486707569e-06, + "objective/entropy": -128.37562561035156, + "objective/kl": 25.527950286865234, + "objective/non_score_reward": -1.276397466659546, + "objective/rlhf_reward": -7.105590343475342, + "objective/scores": -0.5, + "policy/approxkl_avg": 104.87876892089844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 700, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996747612953186 + }, + { + "episode": 11232, + "epoch": 0.06729697666894345, + "loss/policy_avg": 0.0923709124326706, + "lr": 9.552019427402864e-06, + "objective/entropy": 68.89668273925781, + "objective/kl": 44.4783821105957, + "objective/non_score_reward": -2.223918914794922, + "objective/rlhf_reward": -6.772969903723274, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 6.916808128356934, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.53125, + "step": 701, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992003440856934 + }, + { + "episode": 11248, + "epoch": 0.067392841308073, + "loss/policy_avg": 0.13767802715301514, + "lr": 9.55138036809816e-06, + "objective/entropy": 46.92123031616211, + "objective/kl": 56.957984924316406, + "objective/non_score_reward": -2.8478994369506836, + "objective/rlhf_reward": -13.391597747802734, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.40083312988281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.630859375, + "step": 702, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975297451019287 + }, + { + "episode": 11264, + "epoch": 0.06748870594720255, + "loss/policy_avg": 0.08943480253219604, + "lr": 9.550741308793456e-06, + "objective/entropy": 140.6097412109375, + "objective/kl": 43.77709197998047, + "objective/non_score_reward": -2.188854694366455, + "objective/rlhf_reward": -8.75541877746582, + "objective/scores": 0.0, + "policy/approxkl_avg": 59.292110443115234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 703, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0093445777893066 + }, + { + "episode": 11280, + "epoch": 0.0675845705863321, + "loss/policy_avg": 0.38621076941490173, + "lr": 9.550102249488753e-06, + "objective/entropy": 155.836669921875, + "objective/kl": 44.188323974609375, + "objective/non_score_reward": -2.209416389465332, + "objective/rlhf_reward": -6.890253852086003, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 29.918373107910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.546875, + "step": 704, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005943775177 + }, + { + "episode": 11296, + "epoch": 0.06768043522546165, + "loss/policy_avg": -0.12805494666099548, + "lr": 9.54946319018405e-06, + "objective/entropy": 221.62393188476562, + "objective/kl": 56.21504211425781, + "objective/non_score_reward": -2.8107521533966064, + "objective/rlhf_reward": -11.243008494377136, + "objective/scores": 0.0, + "policy/approxkl_avg": 191.72616577148438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.662109375, + "step": 705, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001768112182617 + }, + { + "episode": 11312, + "epoch": 0.0677762998645912, + "loss/policy_avg": 0.2601160407066345, + "lr": 9.548824130879346e-06, + "objective/entropy": 140.55947875976562, + "objective/kl": 41.73223114013672, + "objective/non_score_reward": -2.086611747741699, + "objective/rlhf_reward": -10.346446990966797, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.087754249572754, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.45703125, + "step": 706, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962522983551025 + }, + { + "episode": 11328, + "epoch": 0.06787216450372074, + "loss/policy_avg": 0.41686347126960754, + "lr": 9.548185071574643e-06, + "objective/entropy": 12.326494216918945, + "objective/kl": 47.36491394042969, + "objective/non_score_reward": -2.3682456016540527, + "objective/rlhf_reward": -8.094380714980465, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 3.6295084953308105, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4462890625, + "step": 707, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.009009838104248 + }, + { + "episode": 11344, + "epoch": 0.0679680291428503, + "loss/policy_avg": 1.148893117904663, + "lr": 9.547546012269938e-06, + "objective/entropy": -45.73768615722656, + "objective/kl": 41.53204345703125, + "objective/non_score_reward": -2.0766024589538574, + "objective/rlhf_reward": -8.306409001350403, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.269401550292969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4794921875, + "step": 708, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979407787322998 + }, + { + "episode": 11360, + "epoch": 0.06806389378197984, + "loss/policy_avg": 0.3864780068397522, + "lr": 9.546906952965235e-06, + "objective/entropy": 39.166866302490234, + "objective/kl": 51.79930114746094, + "objective/non_score_reward": -2.589965343475342, + "objective/rlhf_reward": -12.359861373901367, + "objective/scores": -0.5, + "policy/approxkl_avg": 15.646007537841797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 709, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990487098693848 + }, + { + "episode": 11376, + "epoch": 0.0681597584211094, + "loss/policy_avg": 1.1613974571228027, + "lr": 9.546267893660532e-06, + "objective/entropy": -39.009971618652344, + "objective/kl": 47.15522384643555, + "objective/non_score_reward": -2.3577613830566406, + "objective/rlhf_reward": -9.431045532226562, + "objective/scores": 0.0, + "policy/approxkl_avg": 71.2586669921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58203125, + "step": 710, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961512088775635 + }, + { + "episode": 11392, + "epoch": 0.06825562306023894, + "loss/policy_avg": 0.020869355648756027, + "lr": 9.545628834355829e-06, + "objective/entropy": 89.85824584960938, + "objective/kl": 47.41434097290039, + "objective/non_score_reward": -2.3707170486450195, + "objective/rlhf_reward": -8.082868432998657, + "objective/scores": 0.35, + "policy/approxkl_avg": 56.31502914428711, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.517578125, + "step": 711, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000457525253296 + }, + { + "episode": 11408, + "epoch": 0.0683514876993685, + "loss/policy_avg": 0.5365210771560669, + "lr": 9.544989775051126e-06, + "objective/entropy": 100.53746795654297, + "objective/kl": 41.074005126953125, + "objective/non_score_reward": -2.0537002086639404, + "objective/rlhf_reward": -10.214800834655762, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.81124210357666, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.705078125, + "step": 712, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0024473667144775 + }, + { + "episode": 11424, + "epoch": 0.06844735233849804, + "loss/policy_avg": 0.09845000505447388, + "lr": 9.544350715746423e-06, + "objective/entropy": 23.6993408203125, + "objective/kl": 43.17427062988281, + "objective/non_score_reward": -2.1587133407592773, + "objective/rlhf_reward": -4.234853601455688, + "objective/scores": 1.1, + "policy/approxkl_avg": 50.906639099121094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.404296875, + "step": 713, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974209070205688 + }, + { + "episode": 11440, + "epoch": 0.0685432169776276, + "loss/policy_avg": 0.6106125712394714, + "lr": 9.543711656441718e-06, + "objective/entropy": -79.2982177734375, + "objective/kl": 54.40810775756836, + "objective/non_score_reward": -2.7204058170318604, + "objective/rlhf_reward": -9.365851247104342, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 135.19461059570312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.548828125, + "step": 714, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976290464401245 + }, + { + "episode": 11456, + "epoch": 0.06863908161675714, + "loss/policy_avg": 0.574759840965271, + "lr": 9.543072597137015e-06, + "objective/entropy": 162.13650512695312, + "objective/kl": 39.636680603027344, + "objective/non_score_reward": -1.9818341732025146, + "objective/rlhf_reward": -9.927336692810059, + "objective/scores": -0.5, + "policy/approxkl_avg": 106.59353637695312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.70703125, + "step": 715, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982962608337402 + }, + { + "episode": 11472, + "epoch": 0.06873494625588669, + "loss/policy_avg": 0.25162333250045776, + "lr": 9.542433537832312e-06, + "objective/entropy": 151.61593627929688, + "objective/kl": 30.118364334106445, + "objective/non_score_reward": -1.505918264389038, + "objective/rlhf_reward": -3.6236731171607968, + "objective/scores": 0.6, + "policy/approxkl_avg": 27.697843551635742, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.56640625, + "step": 716, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997877597808838 + }, + { + "episode": 11488, + "epoch": 0.06883081089501623, + "loss/policy_avg": 1.1173698902130127, + "lr": 9.541794478527609e-06, + "objective/entropy": -134.95767211914062, + "objective/kl": 43.572898864746094, + "objective/non_score_reward": -2.178645133972168, + "objective/rlhf_reward": -4.314580178260803, + "objective/scores": 1.1, + "policy/approxkl_avg": 97.25651550292969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.609375, + "step": 717, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9965053796768188 + }, + { + "episode": 11504, + "epoch": 0.06892667553414579, + "loss/policy_avg": 0.5863191485404968, + "lr": 9.541155419222906e-06, + "objective/entropy": -156.3623046875, + "objective/kl": 48.465545654296875, + "objective/non_score_reward": -2.4232773780822754, + "objective/rlhf_reward": -8.26927717467126, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 128.9771728515625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.693359375, + "step": 718, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0014121532440186 + }, + { + "episode": 11520, + "epoch": 0.06902254017327533, + "loss/policy_avg": 0.7772470712661743, + "lr": 9.5405163599182e-06, + "objective/entropy": -47.43286895751953, + "objective/kl": 34.12934112548828, + "objective/non_score_reward": -1.7064670324325562, + "objective/rlhf_reward": -6.825868010520935, + "objective/scores": 0.0, + "policy/approxkl_avg": 38.218685150146484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.658203125, + "step": 719, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998382329940796 + }, + { + "episode": 11536, + "epoch": 0.06911840481240489, + "loss/policy_avg": 0.10698091238737106, + "lr": 9.539877300613498e-06, + "objective/entropy": -83.33969116210938, + "objective/kl": 44.03644561767578, + "objective/non_score_reward": -2.201822280883789, + "objective/rlhf_reward": -10.807289123535156, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.700538635253906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.64453125, + "step": 720, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0018362998962402 + }, + { + "episode": 11552, + "epoch": 0.06921426945153443, + "loss/policy_avg": 0.020285863429307938, + "lr": 9.539238241308795e-06, + "objective/entropy": 29.211990356445312, + "objective/kl": 54.56703567504883, + "objective/non_score_reward": -2.7283518314361572, + "objective/rlhf_reward": -12.913407325744629, + "objective/scores": -0.5, + "policy/approxkl_avg": 238.79959106445312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.482421875, + "step": 721, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989087581634521 + }, + { + "episode": 11568, + "epoch": 0.06931013409066399, + "loss/policy_avg": -0.5622311234474182, + "lr": 9.538599182004091e-06, + "objective/entropy": -18.523269653320312, + "objective/kl": 61.245567321777344, + "objective/non_score_reward": -3.0622785091400146, + "objective/rlhf_reward": -12.2491135597229, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.92283821105957, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6015625, + "step": 722, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001999855041504 + }, + { + "episode": 11584, + "epoch": 0.06940599872979353, + "loss/policy_avg": 0.278200626373291, + "lr": 9.537960122699387e-06, + "objective/entropy": 62.370330810546875, + "objective/kl": 35.979461669921875, + "objective/non_score_reward": -1.7989730834960938, + "objective/rlhf_reward": -7.195892691612244, + "objective/scores": 0.0, + "policy/approxkl_avg": 171.5391845703125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.71875, + "step": 723, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976105690002441 + }, + { + "episode": 11600, + "epoch": 0.06950186336892308, + "loss/policy_avg": -0.001310823718085885, + "lr": 9.537321063394683e-06, + "objective/entropy": -119.83037567138672, + "objective/kl": 38.98551940917969, + "objective/non_score_reward": -1.949276089668274, + "objective/rlhf_reward": -4.873385463596556, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 30.58733367919922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5546875, + "step": 724, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.003657817840576 + }, + { + "episode": 11616, + "epoch": 0.06959772800805263, + "loss/policy_avg": 0.4587482213973999, + "lr": 9.53668200408998e-06, + "objective/entropy": 136.93392944335938, + "objective/kl": 47.94542694091797, + "objective/non_score_reward": -2.397271156311035, + "objective/rlhf_reward": -11.58908462524414, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.060930252075195, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.82421875, + "step": 725, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974181652069092 + }, + { + "episode": 11632, + "epoch": 0.06969359264718218, + "loss/policy_avg": 0.12928886711597443, + "lr": 9.536042944785277e-06, + "objective/entropy": 1.9888534545898438, + "objective/kl": 35.45054626464844, + "objective/non_score_reward": -1.7725270986557007, + "objective/rlhf_reward": -4.6901086330413815, + "objective/scores": 0.6, + "policy/approxkl_avg": 118.1571273803711, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.630859375, + "step": 726, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984965324401855 + }, + { + "episode": 11648, + "epoch": 0.06978945728631172, + "loss/policy_avg": 0.188452810049057, + "lr": 9.535403885480572e-06, + "objective/entropy": 6.331550598144531, + "objective/kl": 40.90258026123047, + "objective/non_score_reward": -2.04512882232666, + "objective/rlhf_reward": -6.699562790806651, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 5.804554462432861, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4677734375, + "step": 727, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999557614326477 + }, + { + "episode": 11664, + "epoch": 0.06988532192544128, + "loss/policy_avg": -0.05883468687534332, + "lr": 9.53476482617587e-06, + "objective/entropy": -6.368316650390625, + "objective/kl": 48.47132110595703, + "objective/non_score_reward": -2.4235661029815674, + "objective/rlhf_reward": -9.69426441192627, + "objective/scores": 0.0, + "policy/approxkl_avg": 45.2496337890625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4423828125, + "step": 728, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0150885581970215 + }, + { + "episode": 11680, + "epoch": 0.06998118656457082, + "loss/policy_avg": 0.5350899696350098, + "lr": 9.534125766871166e-06, + "objective/entropy": -111.3927001953125, + "objective/kl": 37.624053955078125, + "objective/non_score_reward": -1.8812024593353271, + "objective/rlhf_reward": -9.524810791015625, + "objective/scores": -0.5, + "policy/approxkl_avg": 141.84805297851562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.564453125, + "step": 729, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9961867332458496 + }, + { + "episode": 11696, + "epoch": 0.07007705120370038, + "loss/policy_avg": 0.385601669549942, + "lr": 9.533486707566463e-06, + "objective/entropy": -71.52942657470703, + "objective/kl": 56.019927978515625, + "objective/non_score_reward": -2.8009965419769287, + "objective/rlhf_reward": -9.780154068668452, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 188.1399383544922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 730, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999030828475952 + }, + { + "episode": 11712, + "epoch": 0.07017291584282992, + "loss/policy_avg": 0.4735555350780487, + "lr": 9.53284764826176e-06, + "objective/entropy": -49.76824188232422, + "objective/kl": 39.93510437011719, + "objective/non_score_reward": -1.9967552423477173, + "objective/rlhf_reward": -7.987020969390869, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2408503293991089, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69921875, + "step": 731, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000382900238037 + }, + { + "episode": 11728, + "epoch": 0.07026878048195948, + "loss/policy_avg": -0.07018261402845383, + "lr": 9.532208588957055e-06, + "objective/entropy": 101.4871597290039, + "objective/kl": 61.77082443237305, + "objective/non_score_reward": -3.0885415077209473, + "objective/rlhf_reward": -12.35416555404663, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.342255592346191, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.728515625, + "step": 732, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993245601654053 + }, + { + "episode": 11744, + "epoch": 0.07036464512108902, + "loss/policy_avg": 0.18908607959747314, + "lr": 9.531569529652352e-06, + "objective/entropy": -35.277099609375, + "objective/kl": 51.8348274230957, + "objective/non_score_reward": -2.5917415618896484, + "objective/rlhf_reward": -10.366965770721436, + "objective/scores": 0.0, + "policy/approxkl_avg": 95.693603515625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5390625, + "step": 733, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992425441741943 + }, + { + "episode": 11760, + "epoch": 0.07046050976021857, + "loss/policy_avg": 0.20972387492656708, + "lr": 9.530930470347649e-06, + "objective/entropy": 50.395660400390625, + "objective/kl": 33.1280632019043, + "objective/non_score_reward": -1.6564031839370728, + "objective/rlhf_reward": -5.314931740960479, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 23.05537986755371, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.607421875, + "step": 734, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998901128768921 + }, + { + "episode": 11776, + "epoch": 0.07055637439934812, + "loss/policy_avg": 0.17211434245109558, + "lr": 9.530291411042946e-06, + "objective/entropy": -47.05531311035156, + "objective/kl": 42.41327667236328, + "objective/non_score_reward": -2.12066388130188, + "objective/rlhf_reward": -7.141019752531676, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 103.662109375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4111328125, + "step": 735, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002054214477539 + }, + { + "episode": 11792, + "epoch": 0.07065223903847767, + "loss/policy_avg": 0.3730859160423279, + "lr": 9.529652351738243e-06, + "objective/entropy": 127.4083251953125, + "objective/kl": 33.88222122192383, + "objective/non_score_reward": -1.6941111087799072, + "objective/rlhf_reward": -3.8527253016245098, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 26.343708038330078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.564453125, + "step": 736, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996037483215332 + }, + { + "episode": 11808, + "epoch": 0.07074810367760721, + "loss/policy_avg": 1.6195980310440063, + "lr": 9.52901329243354e-06, + "objective/entropy": 144.70907592773438, + "objective/kl": 56.71342849731445, + "objective/non_score_reward": -2.8356714248657227, + "objective/rlhf_reward": -11.34268581867218, + "objective/scores": 0.0, + "policy/approxkl_avg": 178.33731079101562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.599609375, + "step": 737, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001779556274414 + }, + { + "episode": 11824, + "epoch": 0.07084396831673677, + "loss/policy_avg": 0.23017962276935577, + "lr": 9.528374233128835e-06, + "objective/entropy": 6.576831817626953, + "objective/kl": 54.3745231628418, + "objective/non_score_reward": -2.71872615814209, + "objective/rlhf_reward": -10.874905467033386, + "objective/scores": 0.0, + "policy/approxkl_avg": 127.34964752197266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47265625, + "step": 738, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999251365661621 + }, + { + "episode": 11840, + "epoch": 0.07093983295586631, + "loss/policy_avg": 0.20700883865356445, + "lr": 9.527735173824132e-06, + "objective/entropy": 68.64032745361328, + "objective/kl": 39.01796340942383, + "objective/non_score_reward": -1.9508981704711914, + "objective/rlhf_reward": -9.803592681884766, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.341182708740234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.556640625, + "step": 739, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999006748199463 + }, + { + "episode": 11856, + "epoch": 0.07103569759499587, + "loss/policy_avg": 0.23299764096736908, + "lr": 9.527096114519428e-06, + "objective/entropy": -73.13887023925781, + "objective/kl": 44.23360061645508, + "objective/non_score_reward": -2.2116799354553223, + "objective/rlhf_reward": -7.242599759165364, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 212.52972412109375, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.53515625, + "step": 740, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007433891296387 + }, + { + "episode": 11872, + "epoch": 0.07113156223412541, + "loss/policy_avg": 0.5644322633743286, + "lr": 9.526457055214725e-06, + "objective/entropy": -129.59765625, + "objective/kl": 45.14954376220703, + "objective/non_score_reward": -2.257477283477783, + "objective/rlhf_reward": -11.029909133911133, + "objective/scores": -0.5, + "policy/approxkl_avg": 266.2381591796875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49609375, + "step": 741, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0021135807037354 + }, + { + "episode": 11888, + "epoch": 0.07122742687325496, + "loss/policy_avg": 0.2687217891216278, + "lr": 9.525817995910022e-06, + "objective/entropy": -3.992889404296875, + "objective/kl": 43.55207061767578, + "objective/non_score_reward": -2.1776037216186523, + "objective/rlhf_reward": -4.310415005683899, + "objective/scores": 1.1, + "policy/approxkl_avg": 29.35465431213379, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.74609375, + "step": 742, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996775388717651 + }, + { + "episode": 11904, + "epoch": 0.0713232915123845, + "loss/policy_avg": 0.4181610643863678, + "lr": 9.525178936605317e-06, + "objective/entropy": -113.56121826171875, + "objective/kl": 56.03525161743164, + "objective/non_score_reward": -2.801762819290161, + "objective/rlhf_reward": -9.896369805535674, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 59.689788818359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 743, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011415481567383 + }, + { + "episode": 11920, + "epoch": 0.07141915615151406, + "loss/policy_avg": 0.20894041657447815, + "lr": 9.524539877300614e-06, + "objective/entropy": -11.158370971679688, + "objective/kl": 53.116886138916016, + "objective/non_score_reward": -2.6558444499969482, + "objective/rlhf_reward": -12.623377799987793, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.38343811035156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4951171875, + "step": 744, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0001273155212402 + }, + { + "episode": 11936, + "epoch": 0.0715150207906436, + "loss/policy_avg": 0.998124897480011, + "lr": 9.52390081799591e-06, + "objective/entropy": 38.680179595947266, + "objective/kl": 52.27276611328125, + "objective/non_score_reward": -2.613638162612915, + "objective/rlhf_reward": -10.454552412033081, + "objective/scores": 0.0, + "policy/approxkl_avg": 165.6784210205078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.466796875, + "step": 745, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996938943862915 + }, + { + "episode": 11952, + "epoch": 0.07161088542977316, + "loss/policy_avg": 0.3075593113899231, + "lr": 9.523261758691206e-06, + "objective/entropy": 94.68702697753906, + "objective/kl": 41.677528381347656, + "objective/non_score_reward": -2.083876609802246, + "objective/rlhf_reward": -10.335506439208984, + "objective/scores": -0.5, + "policy/approxkl_avg": 88.00906372070312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55078125, + "step": 746, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0009684562683105 + }, + { + "episode": 11968, + "epoch": 0.0717067500689027, + "loss/policy_avg": 0.1599944531917572, + "lr": 9.522622699386503e-06, + "objective/entropy": -85.20484924316406, + "objective/kl": 29.781755447387695, + "objective/non_score_reward": -1.4890878200531006, + "objective/rlhf_reward": -3.833644928709541, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 70.03274536132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5625, + "step": 747, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996821641921997 + }, + { + "episode": 11984, + "epoch": 0.07180261470803226, + "loss/policy_avg": -0.09249274432659149, + "lr": 9.5219836400818e-06, + "objective/entropy": 71.3521957397461, + "objective/kl": 70.05450439453125, + "objective/non_score_reward": -3.502725124359131, + "objective/rlhf_reward": -14.010900735855103, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.482736587524414, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.59375, + "step": 748, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002081871032715 + }, + { + "episode": 12000, + "epoch": 0.07189847934716181, + "loss/policy_avg": 0.3381286561489105, + "lr": 9.521344580777097e-06, + "objective/entropy": 22.815597534179688, + "objective/kl": 32.743553161621094, + "objective/non_score_reward": -1.6371777057647705, + "objective/rlhf_reward": -4.426004471556221, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 11.06489372253418, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.494140625, + "step": 749, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998692512512207 + }, + { + "episode": 12016, + "epoch": 0.07199434398629136, + "loss/policy_avg": 0.5579380989074707, + "lr": 9.520705521472394e-06, + "objective/entropy": 159.58346557617188, + "objective/kl": 50.72445297241211, + "objective/non_score_reward": -2.536222457885742, + "objective/rlhf_reward": -12.144889831542969, + "objective/scores": -0.5, + "policy/approxkl_avg": 61.53816223144531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.80078125, + "step": 750, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9964263439178467 + }, + { + "episode": 12032, + "epoch": 0.07209020862542091, + "loss/policy_avg": 0.3575834333896637, + "lr": 9.520066462167689e-06, + "objective/entropy": 167.10739135742188, + "objective/kl": 56.76179122924805, + "objective/non_score_reward": -2.838089942932129, + "objective/rlhf_reward": -13.3523588180542, + "objective/scores": -0.5, + "policy/approxkl_avg": 89.87391662597656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4951171875, + "step": 751, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999544620513916 + }, + { + "episode": 12048, + "epoch": 0.07218607326455045, + "loss/policy_avg": 0.405551552772522, + "lr": 9.519427402862986e-06, + "objective/entropy": 165.36419677734375, + "objective/kl": 47.32508087158203, + "objective/non_score_reward": -2.3662538528442383, + "objective/rlhf_reward": -9.465015769004822, + "objective/scores": 0.0, + "policy/approxkl_avg": 77.1661376953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.52734375, + "step": 752, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999945163726807 + }, + { + "episode": 12064, + "epoch": 0.07228193790368001, + "loss/policy_avg": 0.959913432598114, + "lr": 9.518788343558283e-06, + "objective/entropy": 10.122795104980469, + "objective/kl": 46.78593444824219, + "objective/non_score_reward": -2.339296817779541, + "objective/rlhf_reward": -11.357187271118164, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.604637145996094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 753, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99744713306427 + }, + { + "episode": 12080, + "epoch": 0.07237780254280955, + "loss/policy_avg": 0.12096801400184631, + "lr": 9.51814928425358e-06, + "objective/entropy": -26.37040901184082, + "objective/kl": 48.100013732910156, + "objective/non_score_reward": -2.405000686645508, + "objective/rlhf_reward": -11.620002746582031, + "objective/scores": -0.5, + "policy/approxkl_avg": 50.8070068359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.533203125, + "step": 754, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9955406188964844 + }, + { + "episode": 12096, + "epoch": 0.07247366718193911, + "loss/policy_avg": -0.3492419123649597, + "lr": 9.517510224948877e-06, + "objective/entropy": 189.5272674560547, + "objective/kl": 50.922386169433594, + "objective/non_score_reward": -2.546119451522827, + "objective/rlhf_reward": -10.18447756767273, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.345268726348877, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.578125, + "step": 755, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0023841857910156 + }, + { + "episode": 12112, + "epoch": 0.07256953182106865, + "loss/policy_avg": 0.051233310252428055, + "lr": 9.516871165644172e-06, + "objective/entropy": 52.45849609375, + "objective/kl": 37.70225143432617, + "objective/non_score_reward": -1.8851127624511719, + "objective/rlhf_reward": -7.5404510498046875, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.9853553771972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7109375, + "step": 756, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0019936561584473 + }, + { + "episode": 12128, + "epoch": 0.0726653964601982, + "loss/policy_avg": 0.07707858085632324, + "lr": 9.516232106339469e-06, + "objective/entropy": 62.76935577392578, + "objective/kl": 45.75785827636719, + "objective/non_score_reward": -2.2878928184509277, + "objective/rlhf_reward": -9.151571869850159, + "objective/scores": 0.0, + "policy/approxkl_avg": 110.3807373046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.541015625, + "step": 757, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99611234664917 + }, + { + "episode": 12144, + "epoch": 0.07276126109932775, + "loss/policy_avg": 1.5336410999298096, + "lr": 9.515593047034765e-06, + "objective/entropy": -170.8246307373047, + "objective/kl": 41.72396469116211, + "objective/non_score_reward": -2.086198329925537, + "objective/rlhf_reward": -7.019280109435243, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 34.23369598388672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.505859375, + "step": 758, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984989166259766 + }, + { + "episode": 12160, + "epoch": 0.0728571257384573, + "loss/policy_avg": 0.043852031230926514, + "lr": 9.514953987730062e-06, + "objective/entropy": 137.45498657226562, + "objective/kl": 48.40650177001953, + "objective/non_score_reward": -2.42032527923584, + "objective/rlhf_reward": -9.68130075931549, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.558115005493164, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4189453125, + "step": 759, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002941608428955 + }, + { + "episode": 12176, + "epoch": 0.07295299037758685, + "loss/policy_avg": 0.4132700562477112, + "lr": 9.51431492842536e-06, + "objective/entropy": 18.133056640625, + "objective/kl": 48.084434509277344, + "objective/non_score_reward": -2.404221773147583, + "objective/rlhf_reward": -11.616887092590332, + "objective/scores": -0.5, + "policy/approxkl_avg": 290.97406005859375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.609375, + "step": 760, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9965276718139648 + }, + { + "episode": 12192, + "epoch": 0.0730488550167164, + "loss/policy_avg": 0.9889291524887085, + "lr": 9.513675869120656e-06, + "objective/entropy": -87.85621643066406, + "objective/kl": 40.44783401489258, + "objective/non_score_reward": -2.0223917961120605, + "objective/rlhf_reward": -6.427707438886749, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 5.214938163757324, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.525390625, + "step": 761, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0102572441101074 + }, + { + "episode": 12208, + "epoch": 0.07314471965584594, + "loss/policy_avg": 0.23898151516914368, + "lr": 9.513036809815951e-06, + "objective/entropy": -116.84449005126953, + "objective/kl": 40.57633972167969, + "objective/non_score_reward": -2.0288169384002686, + "objective/rlhf_reward": -10.115267753601074, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.82298278808594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.544921875, + "step": 762, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998227596282959 + }, + { + "episode": 12224, + "epoch": 0.0732405842949755, + "loss/policy_avg": 0.09686745703220367, + "lr": 9.512397750511248e-06, + "objective/entropy": -108.70967864990234, + "objective/kl": 39.81999969482422, + "objective/non_score_reward": -1.9909999370574951, + "objective/rlhf_reward": -6.4482279656254615, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 81.55473327636719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65234375, + "step": 763, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988638162612915 + }, + { + "episode": 12240, + "epoch": 0.07333644893410504, + "loss/policy_avg": -0.09598994255065918, + "lr": 9.511758691206545e-06, + "objective/entropy": 57.47736740112305, + "objective/kl": 43.770423889160156, + "objective/non_score_reward": -2.188521146774292, + "objective/rlhf_reward": -10.754084587097168, + "objective/scores": -0.5, + "policy/approxkl_avg": 53.769474029541016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.53515625, + "step": 764, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0052638053894043 + }, + { + "episode": 12256, + "epoch": 0.0734323135732346, + "loss/policy_avg": 0.47687628865242004, + "lr": 9.511119631901842e-06, + "objective/entropy": -109.64041900634766, + "objective/kl": 33.36266326904297, + "objective/non_score_reward": -1.6681331396102905, + "objective/rlhf_reward": -5.068412694994526, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 49.52349090576172, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5625, + "step": 765, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998824119567871 + }, + { + "episode": 12272, + "epoch": 0.07352817821236414, + "loss/policy_avg": 2.4735023975372314, + "lr": 9.510480572597139e-06, + "objective/entropy": 28.396625518798828, + "objective/kl": 55.17261505126953, + "objective/non_score_reward": -2.7586307525634766, + "objective/rlhf_reward": -11.034523010253906, + "objective/scores": 0.0, + "policy/approxkl_avg": 344.6340637207031, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.79296875, + "step": 766, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982433319091797 + }, + { + "episode": 12288, + "epoch": 0.0736240428514937, + "loss/policy_avg": 0.5203840732574463, + "lr": 9.509841513292434e-06, + "objective/entropy": 64.80726623535156, + "objective/kl": 50.50967788696289, + "objective/non_score_reward": -2.5254838466644287, + "objective/rlhf_reward": -8.36860205332438, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 2.1747188568115234, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.443359375, + "step": 767, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0021090507507324 + }, + { + "episode": 12304, + "epoch": 0.07371990749062324, + "loss/policy_avg": 0.4155661165714264, + "lr": 9.509202453987731e-06, + "objective/entropy": -173.56729125976562, + "objective/kl": 25.973976135253906, + "objective/non_score_reward": -1.2986987829208374, + "objective/rlhf_reward": -3.884114315709472, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 44.03254699707031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 768, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984633922576904 + }, + { + "episode": 12320, + "epoch": 0.0738157721297528, + "loss/policy_avg": 0.3945726156234741, + "lr": 9.508563394683026e-06, + "objective/entropy": -129.74560546875, + "objective/kl": 34.20870590209961, + "objective/non_score_reward": -1.7104352712631226, + "objective/rlhf_reward": -8.841741561889648, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.454164505004883, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.578125, + "step": 769, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000208616256714 + }, + { + "episode": 12336, + "epoch": 0.07391163676888234, + "loss/policy_avg": 0.8236256837844849, + "lr": 9.507924335378323e-06, + "objective/entropy": 41.75614929199219, + "objective/kl": 60.54899215698242, + "objective/non_score_reward": -3.027449369430542, + "objective/rlhf_reward": -10.685965378482905, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 10.848055839538574, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4013671875, + "step": 770, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974124431610107 + }, + { + "episode": 12352, + "epoch": 0.07400750140801189, + "loss/policy_avg": 0.19437658786773682, + "lr": 9.50728527607362e-06, + "objective/entropy": -209.1417236328125, + "objective/kl": 39.18832015991211, + "objective/non_score_reward": -1.9594159126281738, + "objective/rlhf_reward": -7.837664008140564, + "objective/scores": 0.0, + "policy/approxkl_avg": 37.941341400146484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 771, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991120100021362 + }, + { + "episode": 12368, + "epoch": 0.07410336604714143, + "loss/policy_avg": 0.186931774020195, + "lr": 9.506646216768917e-06, + "objective/entropy": 99.30989074707031, + "objective/kl": 54.228919982910156, + "objective/non_score_reward": -2.7114462852478027, + "objective/rlhf_reward": -9.364832046444773, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 147.09829711914062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.52734375, + "step": 772, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0015740394592285 + }, + { + "episode": 12384, + "epoch": 0.07419923068627099, + "loss/policy_avg": 0.5306086540222168, + "lr": 9.506007157464214e-06, + "objective/entropy": 181.44981384277344, + "objective/kl": 39.18826675415039, + "objective/non_score_reward": -1.9594132900238037, + "objective/rlhf_reward": -7.837653160095215, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.374238014221191, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.787109375, + "step": 773, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995557069778442 + }, + { + "episode": 12400, + "epoch": 0.07429509532540053, + "loss/policy_avg": -0.052044857293367386, + "lr": 9.50536809815951e-06, + "objective/entropy": 135.13133239746094, + "objective/kl": 59.5172119140625, + "objective/non_score_reward": -2.975860595703125, + "objective/rlhf_reward": -10.452844838710174, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 162.56112670898438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 774, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.006500244140625 + }, + { + "episode": 12416, + "epoch": 0.07439095996453009, + "loss/policy_avg": 0.1924789994955063, + "lr": 9.504729038854806e-06, + "objective/entropy": -17.830699920654297, + "objective/kl": 51.66797637939453, + "objective/non_score_reward": -2.5833988189697266, + "objective/rlhf_reward": -10.333595514297485, + "objective/scores": 0.0, + "policy/approxkl_avg": 46.85808563232422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3525390625, + "step": 775, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9978249073028564 + }, + { + "episode": 12432, + "epoch": 0.07448682460365963, + "loss/policy_avg": 0.37696969509124756, + "lr": 9.504089979550103e-06, + "objective/entropy": -98.51216125488281, + "objective/kl": 45.98451614379883, + "objective/non_score_reward": -2.2992258071899414, + "objective/rlhf_reward": -11.196903228759766, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.452768325805664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 776, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998502492904663 + }, + { + "episode": 12448, + "epoch": 0.07458268924278919, + "loss/policy_avg": 0.2955755591392517, + "lr": 9.5034509202454e-06, + "objective/entropy": 28.05120849609375, + "objective/kl": 32.432106018066406, + "objective/non_score_reward": -1.621605396270752, + "objective/rlhf_reward": -5.062589605053035, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 0.6783496141433716, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3388671875, + "step": 777, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00238299369812 + }, + { + "episode": 12464, + "epoch": 0.07467855388191873, + "loss/policy_avg": 0.13018612563610077, + "lr": 9.502811860940696e-06, + "objective/entropy": -24.786895751953125, + "objective/kl": 50.896385192871094, + "objective/non_score_reward": -2.5448193550109863, + "objective/rlhf_reward": -12.179277420043945, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.602583885192871, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.615234375, + "step": 778, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000718355178833 + }, + { + "episode": 12480, + "epoch": 0.07477441852104828, + "loss/policy_avg": 0.0673021525144577, + "lr": 9.502172801635993e-06, + "objective/entropy": 70.9556884765625, + "objective/kl": 35.40740203857422, + "objective/non_score_reward": -1.7703702449798584, + "objective/rlhf_reward": -7.081481099128723, + "objective/scores": 0.0, + "policy/approxkl_avg": 44.054901123046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59765625, + "step": 779, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.004899024963379 + }, + { + "episode": 12496, + "epoch": 0.07487028316017783, + "loss/policy_avg": 0.13619756698608398, + "lr": 9.50153374233129e-06, + "objective/entropy": -115.29307556152344, + "objective/kl": 29.000293731689453, + "objective/non_score_reward": -1.450014591217041, + "objective/rlhf_reward": -7.800058364868164, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.81727409362793, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4912109375, + "step": 780, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995115995407104 + }, + { + "episode": 12512, + "epoch": 0.07496614779930738, + "loss/policy_avg": 0.5143172740936279, + "lr": 9.500894683026585e-06, + "objective/entropy": 7.5710601806640625, + "objective/kl": 35.72896957397461, + "objective/non_score_reward": -1.7864482402801514, + "objective/rlhf_reward": -5.023087205664192, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 14.312458038330078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6484375, + "step": 781, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997058629989624 + }, + { + "episode": 12528, + "epoch": 0.07506201243843692, + "loss/policy_avg": 0.8229461908340454, + "lr": 9.500255623721882e-06, + "objective/entropy": 54.34223556518555, + "objective/kl": 58.44230651855469, + "objective/non_score_reward": -2.9221153259277344, + "objective/rlhf_reward": -11.688461184501648, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.381330490112305, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.337890625, + "step": 782, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989904165267944 + }, + { + "episode": 12544, + "epoch": 0.07515787707756648, + "loss/policy_avg": 0.44549697637557983, + "lr": 9.499616564417179e-06, + "objective/entropy": 24.598159790039062, + "objective/kl": 50.54734420776367, + "objective/non_score_reward": -2.52736759185791, + "objective/rlhf_reward": -10.109470129013062, + "objective/scores": 0.0, + "policy/approxkl_avg": 51.31697082519531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4775390625, + "step": 783, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9968284368515015 + }, + { + "episode": 12560, + "epoch": 0.07525374171669602, + "loss/policy_avg": -0.03360733389854431, + "lr": 9.498977505112476e-06, + "objective/entropy": 33.11079025268555, + "objective/kl": 33.97381591796875, + "objective/non_score_reward": -1.6986910104751587, + "objective/rlhf_reward": -5.190644059244709, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 67.89860534667969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5625, + "step": 784, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986311197280884 + }, + { + "episode": 12576, + "epoch": 0.07534960635582558, + "loss/policy_avg": 0.3447728753089905, + "lr": 9.498338445807773e-06, + "objective/entropy": -7.788688659667969, + "objective/kl": 42.518123626708984, + "objective/non_score_reward": -2.125906229019165, + "objective/rlhf_reward": -7.144375049804134, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 37.0943603515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.41796875, + "step": 785, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997989296913147 + }, + { + "episode": 12592, + "epoch": 0.07544547099495512, + "loss/policy_avg": 0.016856741160154343, + "lr": 9.497699386503068e-06, + "objective/entropy": -182.511962890625, + "objective/kl": 34.09797286987305, + "objective/non_score_reward": -1.704898715019226, + "objective/rlhf_reward": -6.819594740867615, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.2245066165924072, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.66796875, + "step": 786, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994986057281494 + }, + { + "episode": 12608, + "epoch": 0.07554133563408467, + "loss/policy_avg": 0.728520393371582, + "lr": 9.497060327198365e-06, + "objective/entropy": -62.65592956542969, + "objective/kl": 41.837890625, + "objective/non_score_reward": -2.0918943881988525, + "objective/rlhf_reward": -10.36757755279541, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.051132202148438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8125, + "step": 787, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.99742591381073 + }, + { + "episode": 12624, + "epoch": 0.07563720027321422, + "loss/policy_avg": 0.20458482205867767, + "lr": 9.496421267893662e-06, + "objective/entropy": 146.9737548828125, + "objective/kl": 39.61537170410156, + "objective/non_score_reward": -1.9807686805725098, + "objective/rlhf_reward": -6.563825094436092, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 21.44344711303711, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75, + "step": 788, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979448318481445 + }, + { + "episode": 12640, + "epoch": 0.07573306491234377, + "loss/policy_avg": 0.0014098212122917175, + "lr": 9.495782208588959e-06, + "objective/entropy": -0.07474517822265625, + "objective/kl": 44.36164855957031, + "objective/non_score_reward": -2.2180824279785156, + "objective/rlhf_reward": -5.948611055256102, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 6.881351470947266, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.70703125, + "step": 789, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984344244003296 + }, + { + "episode": 12656, + "epoch": 0.07582892955147331, + "loss/policy_avg": 0.6741665601730347, + "lr": 9.495143149284254e-06, + "objective/entropy": 76.89515686035156, + "objective/kl": 46.9569091796875, + "objective/non_score_reward": -2.3478455543518066, + "objective/rlhf_reward": -7.967550356586543, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 135.48193359375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65234375, + "step": 790, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995501041412354 + }, + { + "episode": 12672, + "epoch": 0.07592479419060287, + "loss/policy_avg": 0.19035130739212036, + "lr": 9.49450408997955e-06, + "objective/entropy": 161.73770141601562, + "objective/kl": 38.28052520751953, + "objective/non_score_reward": -1.9140264987945557, + "objective/rlhf_reward": -7.656105995178223, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.833908081054688, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.744140625, + "step": 791, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978740215301514 + }, + { + "episode": 12688, + "epoch": 0.07602065882973241, + "loss/policy_avg": 0.6477119326591492, + "lr": 9.493865030674848e-06, + "objective/entropy": -76.65821838378906, + "objective/kl": 46.43013000488281, + "objective/non_score_reward": -2.3215065002441406, + "objective/rlhf_reward": -9.286026120185852, + "objective/scores": 0.0, + "policy/approxkl_avg": 42.871192932128906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.67578125, + "step": 792, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972238540649414 + }, + { + "episode": 12704, + "epoch": 0.07611652346886197, + "loss/policy_avg": 0.5234931707382202, + "lr": 9.493225971370144e-06, + "objective/entropy": -55.55887985229492, + "objective/kl": 58.243072509765625, + "objective/non_score_reward": -2.912153482437134, + "objective/rlhf_reward": -11.648614168167114, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.850006103515625, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.439453125, + "step": 793, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0001378059387207 + }, + { + "episode": 12720, + "epoch": 0.07621238810799151, + "loss/policy_avg": 0.07076121866703033, + "lr": 9.49258691206544e-06, + "objective/entropy": -71.87374114990234, + "objective/kl": 40.136268615722656, + "objective/non_score_reward": -2.0068135261535645, + "objective/rlhf_reward": -10.027254104614258, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.8504383563995361, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.53125, + "step": 794, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992156028747559 + }, + { + "episode": 12736, + "epoch": 0.07630825274712107, + "loss/policy_avg": 0.17811261117458344, + "lr": 9.491947852760736e-06, + "objective/entropy": 131.82073974609375, + "objective/kl": 40.94823455810547, + "objective/non_score_reward": -2.0474116802215576, + "objective/rlhf_reward": -8.189646244049072, + "objective/scores": 0.0, + "policy/approxkl_avg": 75.01351928710938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4521484375, + "step": 795, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998131513595581 + }, + { + "episode": 12752, + "epoch": 0.07640411738625061, + "loss/policy_avg": 0.6544677019119263, + "lr": 9.491308793456033e-06, + "objective/entropy": -84.80082702636719, + "objective/kl": 47.815101623535156, + "objective/non_score_reward": -2.3907551765441895, + "objective/rlhf_reward": -11.563020706176758, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.099197387695312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.60546875, + "step": 796, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984261989593506 + }, + { + "episode": 12768, + "epoch": 0.07649998202538016, + "loss/policy_avg": 0.22662004828453064, + "lr": 9.49066973415133e-06, + "objective/entropy": -86.99248504638672, + "objective/kl": 39.44972229003906, + "objective/non_score_reward": -1.9724860191345215, + "objective/rlhf_reward": -9.889944076538086, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.641038417816162, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.580078125, + "step": 797, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0072741508483887 + }, + { + "episode": 12784, + "epoch": 0.0765958466645097, + "loss/policy_avg": -0.010378673672676086, + "lr": 9.490030674846627e-06, + "objective/entropy": 240.79409790039062, + "objective/kl": 29.289142608642578, + "objective/non_score_reward": -1.4644571542739868, + "objective/rlhf_reward": -7.857828140258789, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.18465805053711, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 1.0078125, + "step": 798, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0018177032470703 + }, + { + "episode": 12800, + "epoch": 0.07669171130363926, + "loss/policy_avg": 0.24487555027008057, + "lr": 9.489391615541922e-06, + "objective/entropy": -22.301956176757812, + "objective/kl": 49.77181625366211, + "objective/non_score_reward": -2.488590955734253, + "objective/rlhf_reward": -9.954363465309143, + "objective/scores": 0.0, + "policy/approxkl_avg": 51.248008728027344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4873046875, + "step": 799, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9972162246704102 + }, + { + "episode": 12816, + "epoch": 0.0767875759427688, + "loss/policy_avg": 0.4941561222076416, + "lr": 9.488752556237219e-06, + "objective/entropy": -19.4157772064209, + "objective/kl": 43.29069519042969, + "objective/non_score_reward": -2.164534568786621, + "objective/rlhf_reward": -8.658138275146484, + "objective/scores": 0.0, + "policy/approxkl_avg": 179.27517700195312, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.580078125, + "step": 800, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980266094207764 + }, + { + "episode": 12832, + "epoch": 0.07688344058189836, + "loss/policy_avg": 0.2502278685569763, + "lr": 9.488113496932516e-06, + "objective/entropy": 73.13986206054688, + "objective/kl": 49.88857650756836, + "objective/non_score_reward": -2.4944286346435547, + "objective/rlhf_reward": -9.977714776992798, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.657133102416992, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.564453125, + "step": 801, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984219074249268 + }, + { + "episode": 12848, + "epoch": 0.0769793052210279, + "loss/policy_avg": 0.47350820899009705, + "lr": 9.487474437627813e-06, + "objective/entropy": -24.693031311035156, + "objective/kl": 45.92048645019531, + "objective/non_score_reward": -2.2960243225097656, + "objective/rlhf_reward": -11.184097290039062, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.06195068359375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 802, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986913204193115 + }, + { + "episode": 12864, + "epoch": 0.07707516986015746, + "loss/policy_avg": -0.07447345554828644, + "lr": 9.48683537832311e-06, + "objective/entropy": 7.078838348388672, + "objective/kl": 45.001708984375, + "objective/non_score_reward": -2.2500853538513184, + "objective/rlhf_reward": -11.000341415405273, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.8471827507019043, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4248046875, + "step": 803, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995155334472656 + }, + { + "episode": 12880, + "epoch": 0.077171034499287, + "loss/policy_avg": 0.4310092031955719, + "lr": 9.486196319018407e-06, + "objective/entropy": 60.94478225708008, + "objective/kl": 50.807186126708984, + "objective/non_score_reward": -2.5403592586517334, + "objective/rlhf_reward": -12.161437034606934, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.37901306152344, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.56640625, + "step": 804, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978930950164795 + }, + { + "episode": 12896, + "epoch": 0.07726689913841656, + "loss/policy_avg": 0.406343549489975, + "lr": 9.485557259713702e-06, + "objective/entropy": -80.95048522949219, + "objective/kl": 35.10383605957031, + "objective/non_score_reward": -1.755191683769226, + "objective/rlhf_reward": -5.358907466352568, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 70.2693099975586, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.693359375, + "step": 805, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9967843294143677 + }, + { + "episode": 12912, + "epoch": 0.0773627637775461, + "loss/policy_avg": -0.020584769546985626, + "lr": 9.484918200408999e-06, + "objective/entropy": -1.2308731079101562, + "objective/kl": 45.5025634765625, + "objective/non_score_reward": -2.2751283645629883, + "objective/rlhf_reward": -11.100513458251953, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.7406916618347168, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4619140625, + "step": 806, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003175735473633 + }, + { + "episode": 12928, + "epoch": 0.07745862841667565, + "loss/policy_avg": 0.5163513422012329, + "lr": 9.484279141104296e-06, + "objective/entropy": -29.644989013671875, + "objective/kl": 50.42848587036133, + "objective/non_score_reward": -2.5214242935180664, + "objective/rlhf_reward": -10.085696697235107, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8592278957366943, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.583984375, + "step": 807, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0014615058898926 + }, + { + "episode": 12944, + "epoch": 0.07755449305580521, + "loss/policy_avg": 0.13020086288452148, + "lr": 9.483640081799592e-06, + "objective/entropy": -39.59818649291992, + "objective/kl": 37.97582244873047, + "objective/non_score_reward": -1.8987913131713867, + "objective/rlhf_reward": -7.595165014266968, + "objective/scores": 0.0, + "policy/approxkl_avg": 21.444660186767578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.521484375, + "step": 808, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973077774047852 + }, + { + "episode": 12960, + "epoch": 0.07765035769493475, + "loss/policy_avg": -0.20125547051429749, + "lr": 9.48300102249489e-06, + "objective/entropy": 74.72931671142578, + "objective/kl": 44.10024642944336, + "objective/non_score_reward": -2.205012321472168, + "objective/rlhf_reward": -7.441446878997189, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 40.46785354614258, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.62890625, + "step": 809, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997934103012085 + }, + { + "episode": 12976, + "epoch": 0.07774622233406431, + "loss/policy_avg": 0.7126036882400513, + "lr": 9.482361963190185e-06, + "objective/entropy": 190.00839233398438, + "objective/kl": 50.626190185546875, + "objective/non_score_reward": -2.5313093662261963, + "objective/rlhf_reward": -12.125237464904785, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.1100707054138184, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.76953125, + "step": 810, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999626636505127 + }, + { + "episode": 12992, + "epoch": 0.07784208697319385, + "loss/policy_avg": 0.2868719696998596, + "lr": 9.481722903885481e-06, + "objective/entropy": 98.94801330566406, + "objective/kl": 49.62665557861328, + "objective/non_score_reward": -2.481333017349243, + "objective/rlhf_reward": -11.925332069396973, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.311702728271484, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.625, + "step": 811, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970800876617432 + }, + { + "episode": 13008, + "epoch": 0.0779379516123234, + "loss/policy_avg": 0.4926082193851471, + "lr": 9.481083844580777e-06, + "objective/entropy": -47.03901290893555, + "objective/kl": 46.152259826660156, + "objective/non_score_reward": -2.307612895965576, + "objective/rlhf_reward": -11.230451583862305, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.286166191101074, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5625, + "step": 812, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9965301752090454 + }, + { + "episode": 13024, + "epoch": 0.07803381625145295, + "loss/policy_avg": 0.4978470802307129, + "lr": 9.480444785276073e-06, + "objective/entropy": -49.30766296386719, + "objective/kl": 38.764366149902344, + "objective/non_score_reward": -1.9382184743881226, + "objective/rlhf_reward": -7.752874255180359, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.598820686340332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 813, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000823974609375 + }, + { + "episode": 13040, + "epoch": 0.0781296808905825, + "loss/policy_avg": 1.0125069618225098, + "lr": 9.47980572597137e-06, + "objective/entropy": -71.78276062011719, + "objective/kl": 39.826438903808594, + "objective/non_score_reward": -1.9913220405578613, + "objective/rlhf_reward": -6.639775249987764, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 51.534149169921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.55078125, + "step": 814, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982564449310303 + }, + { + "episode": 13056, + "epoch": 0.07822554552971205, + "loss/policy_avg": 0.2897603511810303, + "lr": 9.479166666666667e-06, + "objective/entropy": -5.348247528076172, + "objective/kl": 52.49787139892578, + "objective/non_score_reward": -2.6248936653137207, + "objective/rlhf_reward": -9.099574184417724, + "objective/scores": 0.35, + "policy/approxkl_avg": 6.661520004272461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 815, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002255916595459 + }, + { + "episode": 13072, + "epoch": 0.0783214101688416, + "loss/policy_avg": 0.17362172901630402, + "lr": 9.478527607361964e-06, + "objective/entropy": -116.49378204345703, + "objective/kl": 46.34797668457031, + "objective/non_score_reward": -2.317399024963379, + "objective/rlhf_reward": -9.269595384597778, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.514812469482422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4716796875, + "step": 816, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992300271987915 + }, + { + "episode": 13088, + "epoch": 0.07841727480797114, + "loss/policy_avg": -0.08885104209184647, + "lr": 9.477888548057261e-06, + "objective/entropy": 32.933349609375, + "objective/kl": 40.6209716796875, + "objective/non_score_reward": -2.03104829788208, + "objective/rlhf_reward": -8.124193787574768, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1266255378723145, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.58984375, + "step": 817, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.007244110107422 + }, + { + "episode": 13104, + "epoch": 0.0785131394471007, + "loss/policy_avg": 2.0771470069885254, + "lr": 9.477249488752556e-06, + "objective/entropy": 59.07233810424805, + "objective/kl": 48.80291748046875, + "objective/non_score_reward": -2.440145969390869, + "objective/rlhf_reward": -9.760583877563477, + "objective/scores": 0.0, + "policy/approxkl_avg": 11.791816711425781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 818, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990895986557007 + }, + { + "episode": 13120, + "epoch": 0.07860900408623024, + "loss/policy_avg": 0.09870730340480804, + "lr": 9.476610429447853e-06, + "objective/entropy": -95.96932983398438, + "objective/kl": 34.641910552978516, + "objective/non_score_reward": -1.7320955991744995, + "objective/rlhf_reward": -4.528382396697998, + "objective/scores": 0.6, + "policy/approxkl_avg": 6.980551719665527, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 819, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003089904785156 + }, + { + "episode": 13136, + "epoch": 0.0787048687253598, + "loss/policy_avg": 0.8834266066551208, + "lr": 9.47597137014315e-06, + "objective/entropy": 111.77400207519531, + "objective/kl": 43.839324951171875, + "objective/non_score_reward": -2.1919662952423096, + "objective/rlhf_reward": -8.767865180969238, + "objective/scores": 0.0, + "policy/approxkl_avg": 25.22732162475586, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73828125, + "step": 820, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9966464042663574 + }, + { + "episode": 13152, + "epoch": 0.07880073336448934, + "loss/policy_avg": 0.5127257704734802, + "lr": 9.475332310838447e-06, + "objective/entropy": 193.13796997070312, + "objective/kl": 54.76216506958008, + "objective/non_score_reward": -2.7381086349487305, + "objective/rlhf_reward": -9.61079817107263, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 4.750777721405029, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.626953125, + "step": 821, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994168281555176 + }, + { + "episode": 13168, + "epoch": 0.0788965980036189, + "loss/policy_avg": 0.947475790977478, + "lr": 9.474693251533744e-06, + "objective/entropy": 94.34573364257812, + "objective/kl": 57.877140045166016, + "objective/non_score_reward": -2.89385724067688, + "objective/rlhf_reward": -11.575428485870361, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.906923294067383, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4970703125, + "step": 822, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996126413345337 + }, + { + "episode": 13184, + "epoch": 0.07899246264274844, + "loss/policy_avg": 0.5285969376564026, + "lr": 9.474054192229039e-06, + "objective/entropy": 96.22802734375, + "objective/kl": 49.417755126953125, + "objective/non_score_reward": -2.4708876609802246, + "objective/rlhf_reward": -11.883550643920898, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.867500305175781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 823, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988512992858887 + }, + { + "episode": 13200, + "epoch": 0.079088327281878, + "loss/policy_avg": 0.940088152885437, + "lr": 9.473415132924336e-06, + "objective/entropy": 2.389057159423828, + "objective/kl": 48.8427734375, + "objective/non_score_reward": -2.442138671875, + "objective/rlhf_reward": -9.768555164337158, + "objective/scores": 0.0, + "policy/approxkl_avg": 14.532787322998047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.57421875, + "step": 824, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981268644332886 + }, + { + "episode": 13216, + "epoch": 0.07918419192100754, + "loss/policy_avg": 0.08457101136445999, + "lr": 9.472776073619633e-06, + "objective/entropy": 163.4175262451172, + "objective/kl": 43.881683349609375, + "objective/non_score_reward": -2.1940841674804688, + "objective/rlhf_reward": -8.776336789131165, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.60313606262207, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.578125, + "step": 825, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002368927001953 + }, + { + "episode": 13232, + "epoch": 0.07928005656013709, + "loss/policy_avg": 0.37017756700515747, + "lr": 9.47213701431493e-06, + "objective/entropy": 157.41543579101562, + "objective/kl": 48.02391815185547, + "objective/non_score_reward": -2.401196002960205, + "objective/rlhf_reward": -8.279270920783205, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 25.962081909179688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.841796875, + "step": 826, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.995481014251709 + }, + { + "episode": 13248, + "epoch": 0.07937592119926663, + "loss/policy_avg": 1.374605655670166, + "lr": 9.471497955010226e-06, + "objective/entropy": 71.75035095214844, + "objective/kl": 55.77865982055664, + "objective/non_score_reward": -2.788933277130127, + "objective/rlhf_reward": -13.155733108520508, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.36000919342041, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.61328125, + "step": 827, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998390555381775 + }, + { + "episode": 13264, + "epoch": 0.07947178583839619, + "loss/policy_avg": 0.1622915118932724, + "lr": 9.470858895705523e-06, + "objective/entropy": 60.57987976074219, + "objective/kl": 57.39149475097656, + "objective/non_score_reward": -2.869575023651123, + "objective/rlhf_reward": -13.478300094604492, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.24340057373047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.521484375, + "step": 828, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986965656280518 + }, + { + "episode": 13280, + "epoch": 0.07956765047752573, + "loss/policy_avg": 0.19207358360290527, + "lr": 9.470219836400818e-06, + "objective/entropy": -58.32844543457031, + "objective/kl": 51.39813995361328, + "objective/non_score_reward": -2.5699069499969482, + "objective/rlhf_reward": -10.279627680778503, + "objective/scores": 0.0, + "policy/approxkl_avg": 49.676612854003906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.51953125, + "step": 829, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9977648258209229 + }, + { + "episode": 13296, + "epoch": 0.07966351511665529, + "loss/policy_avg": -0.12824378907680511, + "lr": 9.469580777096115e-06, + "objective/entropy": 10.363025665283203, + "objective/kl": 40.549110412597656, + "objective/non_score_reward": -2.0274555683135986, + "objective/rlhf_reward": -8.109822034835815, + "objective/scores": 0.0, + "policy/approxkl_avg": 44.34107971191406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5, + "step": 830, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0052590370178223 + }, + { + "episode": 13312, + "epoch": 0.07975937975578483, + "loss/policy_avg": 0.6754217147827148, + "lr": 9.468941717791412e-06, + "objective/entropy": 42.773773193359375, + "objective/kl": 46.71219253540039, + "objective/non_score_reward": -2.3356099128723145, + "objective/rlhf_reward": -11.342439651489258, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.964271545410156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.728515625, + "step": 831, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992461204528809 + }, + { + "episode": 13328, + "epoch": 0.07985524439491438, + "loss/policy_avg": -0.004962414503097534, + "lr": 9.468302658486709e-06, + "objective/entropy": -35.318992614746094, + "objective/kl": 44.108856201171875, + "objective/non_score_reward": -2.2054426670074463, + "objective/rlhf_reward": -10.821770668029785, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.2813923358917236, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.60546875, + "step": 832, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.004133701324463 + }, + { + "episode": 13344, + "epoch": 0.07995110903404393, + "loss/policy_avg": 1.4870716333389282, + "lr": 9.467663599182006e-06, + "objective/entropy": -21.525497436523438, + "objective/kl": 34.871070861816406, + "objective/non_score_reward": -1.7435535192489624, + "objective/rlhf_reward": -8.974214553833008, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.968148708343506, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.830078125, + "step": 833, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0055696964263916 + }, + { + "episode": 13360, + "epoch": 0.08004697367317348, + "loss/policy_avg": 0.7669464349746704, + "lr": 9.467024539877301e-06, + "objective/entropy": 50.29515075683594, + "objective/kl": 49.8994026184082, + "objective/non_score_reward": -2.4949703216552734, + "objective/rlhf_reward": -11.979881286621094, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.470100402832031, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.82421875, + "step": 834, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000497579574585 + }, + { + "episode": 13376, + "epoch": 0.08014283831230302, + "loss/policy_avg": 0.23561632633209229, + "lr": 9.466385480572598e-06, + "objective/entropy": 36.33641815185547, + "objective/kl": 47.00213623046875, + "objective/non_score_reward": -2.350106954574585, + "objective/rlhf_reward": -11.40042781829834, + "objective/scores": -0.5, + "policy/approxkl_avg": 83.57771301269531, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.455078125, + "step": 835, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999067783355713 + }, + { + "episode": 13392, + "epoch": 0.08023870295143258, + "loss/policy_avg": 0.46913832426071167, + "lr": 9.465746421267893e-06, + "objective/entropy": 169.9302978515625, + "objective/kl": 47.96323013305664, + "objective/non_score_reward": -2.3981614112854004, + "objective/rlhf_reward": -11.592645645141602, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.6524410247802734, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.70703125, + "step": 836, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001760482788086 + }, + { + "episode": 13408, + "epoch": 0.08033456759056212, + "loss/policy_avg": 0.02686675637960434, + "lr": 9.46510736196319e-06, + "objective/entropy": -79.2550277709961, + "objective/kl": 52.192962646484375, + "objective/non_score_reward": -2.6096482276916504, + "objective/rlhf_reward": -10.43859326839447, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4889419078826904, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4619140625, + "step": 837, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001600742340088 + }, + { + "episode": 13424, + "epoch": 0.08043043222969168, + "loss/policy_avg": -0.3698367476463318, + "lr": 9.464468302658487e-06, + "objective/entropy": -87.23965454101562, + "objective/kl": 41.45216369628906, + "objective/non_score_reward": -2.072608470916748, + "objective/rlhf_reward": -6.979752173623442, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 63.50639343261719, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.548828125, + "step": 838, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.005016803741455 + }, + { + "episode": 13440, + "epoch": 0.08052629686882122, + "loss/policy_avg": 0.18586644530296326, + "lr": 9.463829243353784e-06, + "objective/entropy": 32.29985427856445, + "objective/kl": 44.41960906982422, + "objective/non_score_reward": -2.220980405807495, + "objective/rlhf_reward": -10.88392162322998, + "objective/scores": -0.5, + "policy/approxkl_avg": 72.6720962524414, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.70703125, + "step": 839, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992097616195679 + }, + { + "episode": 13456, + "epoch": 0.08062216150795078, + "loss/policy_avg": 0.269380122423172, + "lr": 9.46319018404908e-06, + "objective/entropy": -125.30670166015625, + "objective/kl": 43.14105224609375, + "objective/non_score_reward": -2.157052516937256, + "objective/rlhf_reward": -7.286574771910338, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 48.048805236816406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.552734375, + "step": 840, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996854305267334 + }, + { + "episode": 13472, + "epoch": 0.08071802614708032, + "loss/policy_avg": 0.4670361876487732, + "lr": 9.462551124744378e-06, + "objective/entropy": 105.06407928466797, + "objective/kl": 43.80266571044922, + "objective/non_score_reward": -2.1901330947875977, + "objective/rlhf_reward": -10.76053237915039, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.106964111328125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.689453125, + "step": 841, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962818622589111 + }, + { + "episode": 13488, + "epoch": 0.08081389078620987, + "loss/policy_avg": 0.15953579545021057, + "lr": 9.461912065439673e-06, + "objective/entropy": -35.255577087402344, + "objective/kl": 38.70480728149414, + "objective/non_score_reward": -1.935240387916565, + "objective/rlhf_reward": -3.3409615516662594, + "objective/scores": 1.1, + "policy/approxkl_avg": 171.99822998046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6015625, + "step": 842, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9976792335510254 + }, + { + "episode": 13504, + "epoch": 0.08090975542533942, + "loss/policy_avg": 0.7678501605987549, + "lr": 9.46127300613497e-06, + "objective/entropy": -160.68154907226562, + "objective/kl": 48.07463836669922, + "objective/non_score_reward": -2.4037318229675293, + "objective/rlhf_reward": -11.614927291870117, + "objective/scores": -0.5, + "policy/approxkl_avg": 54.11241149902344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.732421875, + "step": 843, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9969501495361328 + }, + { + "episode": 13520, + "epoch": 0.08100562006446897, + "loss/policy_avg": 0.614832878112793, + "lr": 9.460633946830267e-06, + "objective/entropy": 175.1295928955078, + "objective/kl": 46.40818405151367, + "objective/non_score_reward": -2.3204092979431152, + "objective/rlhf_reward": -7.970955958566069, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 60.26636505126953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.83203125, + "step": 844, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007455348968506 + }, + { + "episode": 13536, + "epoch": 0.08110148470359851, + "loss/policy_avg": -0.023550916463136673, + "lr": 9.459994887525563e-06, + "objective/entropy": -42.230506896972656, + "objective/kl": 61.776058197021484, + "objective/non_score_reward": -3.0888028144836426, + "objective/rlhf_reward": -10.693351989210235, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 3.9804036617279053, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 845, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984233379364014 + }, + { + "episode": 13552, + "epoch": 0.08119734934272807, + "loss/policy_avg": 0.044094473123550415, + "lr": 9.45935582822086e-06, + "objective/entropy": 59.97814178466797, + "objective/kl": 46.58905029296875, + "objective/non_score_reward": -2.3294525146484375, + "objective/rlhf_reward": -11.317811012268066, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.484856128692627, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69921875, + "step": 846, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0023152828216553 + }, + { + "episode": 13568, + "epoch": 0.08129321398185761, + "loss/policy_avg": 1.1448770761489868, + "lr": 9.458716768916156e-06, + "objective/entropy": 111.65227508544922, + "objective/kl": 43.068267822265625, + "objective/non_score_reward": -2.1534132957458496, + "objective/rlhf_reward": -8.613653063774109, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.379885673522949, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.642578125, + "step": 847, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0004796981811523 + }, + { + "episode": 13584, + "epoch": 0.08138907862098717, + "loss/policy_avg": 0.8181858658790588, + "lr": 9.458077709611452e-06, + "objective/entropy": -56.673152923583984, + "objective/kl": 48.91077423095703, + "objective/non_score_reward": -2.4455389976501465, + "objective/rlhf_reward": -7.659449162260566, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.835422515869141, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 848, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991991519927979 + }, + { + "episode": 13600, + "epoch": 0.08148494326011671, + "loss/policy_avg": 0.11813211441040039, + "lr": 9.45743865030675e-06, + "objective/entropy": -4.415027618408203, + "objective/kl": 50.83733367919922, + "objective/non_score_reward": -2.5418667793273926, + "objective/rlhf_reward": -8.788865187255245, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 22.646202087402344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 849, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996605634689331 + }, + { + "episode": 13616, + "epoch": 0.08158080789924627, + "loss/policy_avg": 0.11795713007450104, + "lr": 9.456799591002046e-06, + "objective/entropy": 62.60900115966797, + "objective/kl": 45.315032958984375, + "objective/non_score_reward": -2.265751838684082, + "objective/rlhf_reward": -7.721371462851195, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 71.31930541992188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.751953125, + "step": 850, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0022430419921875 + }, + { + "episode": 13632, + "epoch": 0.08167667253837581, + "loss/policy_avg": 0.5159366130828857, + "lr": 9.456160531697343e-06, + "objective/entropy": -175.90101623535156, + "objective/kl": 33.65061950683594, + "objective/non_score_reward": -1.6825311183929443, + "objective/rlhf_reward": -5.214352810176548, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 4.706970691680908, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 851, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9972381591796875 + }, + { + "episode": 13648, + "epoch": 0.08177253717750536, + "loss/policy_avg": 0.5951172113418579, + "lr": 9.45552147239264e-06, + "objective/entropy": -19.370243072509766, + "objective/kl": 45.74876022338867, + "objective/non_score_reward": -2.287438154220581, + "objective/rlhf_reward": -11.149751663208008, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.44702911376953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 852, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.003701686859131 + }, + { + "episode": 13664, + "epoch": 0.0818684018166349, + "loss/policy_avg": -0.09048572182655334, + "lr": 9.454882413087935e-06, + "objective/entropy": -7.4266357421875, + "objective/kl": 34.617713928222656, + "objective/non_score_reward": -1.7308857440948486, + "objective/rlhf_reward": -8.923542976379395, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.7110774517059326, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.771484375, + "step": 853, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0029666423797607 + }, + { + "episode": 13680, + "epoch": 0.08196426645576446, + "loss/policy_avg": 0.6564300656318665, + "lr": 9.454243353783232e-06, + "objective/entropy": -102.97590637207031, + "objective/kl": 40.7784423828125, + "objective/non_score_reward": -2.0389223098754883, + "objective/rlhf_reward": -8.155688762664795, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.9911718368530273, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.71484375, + "step": 854, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001359462738037 + }, + { + "episode": 13696, + "epoch": 0.082060131094894, + "loss/policy_avg": 0.027742426842451096, + "lr": 9.453604294478529e-06, + "objective/entropy": 94.58720397949219, + "objective/kl": 48.34076690673828, + "objective/non_score_reward": -2.4170384407043457, + "objective/rlhf_reward": -8.268153524398803, + "objective/scores": 0.35, + "policy/approxkl_avg": 9.164468765258789, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.837890625, + "step": 855, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0004348754882812 + }, + { + "episode": 13712, + "epoch": 0.08215599573402356, + "loss/policy_avg": 0.08657915890216827, + "lr": 9.452965235173824e-06, + "objective/entropy": -35.318023681640625, + "objective/kl": 40.64269256591797, + "objective/non_score_reward": -2.032134532928467, + "objective/rlhf_reward": -10.128538131713867, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.59064483642578, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.630859375, + "step": 856, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0009994506835938 + }, + { + "episode": 13728, + "epoch": 0.0822518603731531, + "loss/policy_avg": 0.8454070687294006, + "lr": 9.452326175869121e-06, + "objective/entropy": -31.412460327148438, + "objective/kl": 25.98988151550293, + "objective/non_score_reward": -1.2994941473007202, + "objective/rlhf_reward": -3.819374420729977, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 28.413898468017578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71484375, + "step": 857, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0007643699645996 + }, + { + "episode": 13744, + "epoch": 0.08234772501228266, + "loss/policy_avg": 0.3684687614440918, + "lr": 9.451687116564418e-06, + "objective/entropy": -238.05677795410156, + "objective/kl": 37.133724212646484, + "objective/non_score_reward": -1.8566862344741821, + "objective/rlhf_reward": -6.067495071624203, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 2.1579315662384033, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73828125, + "step": 858, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0006093978881836 + }, + { + "episode": 13760, + "epoch": 0.0824435896514122, + "loss/policy_avg": 0.2932976484298706, + "lr": 9.451048057259715e-06, + "objective/entropy": -9.110309600830078, + "objective/kl": 53.04133987426758, + "objective/non_score_reward": -2.652066946029663, + "objective/rlhf_reward": -12.608267784118652, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.479991912841797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.86328125, + "step": 859, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000779628753662 + }, + { + "episode": 13776, + "epoch": 0.08253945429054176, + "loss/policy_avg": 0.08329109847545624, + "lr": 9.45040899795501e-06, + "objective/entropy": 62.41896057128906, + "objective/kl": 49.289058685302734, + "objective/non_score_reward": -2.4644529819488525, + "objective/rlhf_reward": -5.45781192779541, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.0560598373413086, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58203125, + "step": 860, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011184215545654 + }, + { + "episode": 13792, + "epoch": 0.0826353189296713, + "loss/policy_avg": 0.11985829472541809, + "lr": 9.449769938650307e-06, + "objective/entropy": 126.35794067382812, + "objective/kl": 59.27776336669922, + "objective/non_score_reward": -2.963888168334961, + "objective/rlhf_reward": -13.855552673339844, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.6051025390625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.724609375, + "step": 861, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9953217506408691 + }, + { + "episode": 13808, + "epoch": 0.08273118356880085, + "loss/policy_avg": 1.0325790643692017, + "lr": 9.449130879345604e-06, + "objective/entropy": 9.744304656982422, + "objective/kl": 37.47538375854492, + "objective/non_score_reward": -1.8737692832946777, + "objective/rlhf_reward": -9.495077133178711, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.714500427246094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 862, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997491717338562 + }, + { + "episode": 13824, + "epoch": 0.0828270482079304, + "loss/policy_avg": 0.17044967412948608, + "lr": 9.4484918200409e-06, + "objective/entropy": -89.74710083007812, + "objective/kl": 33.05152130126953, + "objective/non_score_reward": -1.6525760889053345, + "objective/rlhf_reward": -4.948444967687713, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 2.638071060180664, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.748046875, + "step": 863, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986933469772339 + }, + { + "episode": 13840, + "epoch": 0.08292291284705995, + "loss/policy_avg": 0.7200032472610474, + "lr": 9.447852760736197e-06, + "objective/entropy": 27.844676971435547, + "objective/kl": 56.46900939941406, + "objective/non_score_reward": -2.8234505653381348, + "objective/rlhf_reward": -9.81284964364326, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 7.630801200866699, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.701171875, + "step": 864, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977896213531494 + }, + { + "episode": 13856, + "epoch": 0.08301877748618951, + "loss/policy_avg": 0.22227337956428528, + "lr": 9.447213701431494e-06, + "objective/entropy": -178.34201049804688, + "objective/kl": 38.26736068725586, + "objective/non_score_reward": -1.9133679866790771, + "objective/rlhf_reward": -9.653472900390625, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.848773956298828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.666015625, + "step": 865, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000682830810547 + }, + { + "episode": 13872, + "epoch": 0.08311464212531905, + "loss/policy_avg": 1.1648956537246704, + "lr": 9.44657464212679e-06, + "objective/entropy": 56.50071334838867, + "objective/kl": 42.770015716552734, + "objective/non_score_reward": -2.138500690460205, + "objective/rlhf_reward": -8.55400288105011, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.976739883422852, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.732421875, + "step": 866, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000074625015259 + }, + { + "episode": 13888, + "epoch": 0.0832105067644486, + "loss/policy_avg": 0.04914632439613342, + "lr": 9.445935582822086e-06, + "objective/entropy": 123.36146545410156, + "objective/kl": 44.54328918457031, + "objective/non_score_reward": -2.2271645069122314, + "objective/rlhf_reward": -4.508658146858215, + "objective/scores": 1.1, + "policy/approxkl_avg": 69.43045806884766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.650390625, + "step": 867, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9951891899108887 + }, + { + "episode": 13904, + "epoch": 0.08330637140357815, + "loss/policy_avg": 0.6058514714241028, + "lr": 9.445296523517383e-06, + "objective/entropy": -120.66094970703125, + "objective/kl": 35.81827163696289, + "objective/non_score_reward": -1.790913701057434, + "objective/rlhf_reward": -9.163654327392578, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.4808473587036133, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.677734375, + "step": 868, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999987006187439 + }, + { + "episode": 13920, + "epoch": 0.0834022360427077, + "loss/policy_avg": 0.0709524005651474, + "lr": 9.44465746421268e-06, + "objective/entropy": -70.09716796875, + "objective/kl": 37.903717041015625, + "objective/non_score_reward": -1.8951858282089233, + "objective/rlhf_reward": -4.657024179340574, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.30440616607666, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.87109375, + "step": 869, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9960893392562866 + }, + { + "episode": 13936, + "epoch": 0.08349810068183725, + "loss/policy_avg": 0.1490383893251419, + "lr": 9.444018404907977e-06, + "objective/entropy": -69.44090270996094, + "objective/kl": 40.43378448486328, + "objective/non_score_reward": -2.0216891765594482, + "objective/rlhf_reward": -10.086756706237793, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.21539878845215, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.60546875, + "step": 870, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0008134841918945 + }, + { + "episode": 13952, + "epoch": 0.0835939653209668, + "loss/policy_avg": -0.14227427542209625, + "lr": 9.443379345603272e-06, + "objective/entropy": -204.24081420898438, + "objective/kl": 38.59701156616211, + "objective/non_score_reward": -1.9298505783081055, + "objective/rlhf_reward": -5.894573803218911, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 94.90866088867188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.833984375, + "step": 871, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.996568202972412 + }, + { + "episode": 13968, + "epoch": 0.08368982996009634, + "loss/policy_avg": -0.35594648122787476, + "lr": 9.442740286298569e-06, + "objective/entropy": -88.02035522460938, + "objective/kl": 62.79528045654297, + "objective/non_score_reward": -3.1397640705108643, + "objective/rlhf_reward": -10.159056282043458, + "objective/scores": 0.6, + "policy/approxkl_avg": 14.63131332397461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.78125, + "step": 872, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0070295333862305 + }, + { + "episode": 13984, + "epoch": 0.0837856945992259, + "loss/policy_avg": 0.5958549976348877, + "lr": 9.442101226993866e-06, + "objective/entropy": -48.78741455078125, + "objective/kl": 38.900726318359375, + "objective/non_score_reward": -1.9450364112854004, + "objective/rlhf_reward": -7.780145764350891, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.4714508056640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.67578125, + "step": 873, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0014560222625732 + }, + { + "episode": 14000, + "epoch": 0.08388155923835544, + "loss/policy_avg": 0.3314509987831116, + "lr": 9.441462167689163e-06, + "objective/entropy": 76.92801666259766, + "objective/kl": 40.309932708740234, + "objective/non_score_reward": -2.0154967308044434, + "objective/rlhf_reward": -10.061986923217773, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.135623931884766, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.689453125, + "step": 874, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995381832122803 + }, + { + "episode": 14016, + "epoch": 0.083977423877485, + "loss/policy_avg": 0.32340991497039795, + "lr": 9.44082310838446e-06, + "objective/entropy": 65.0713882446289, + "objective/kl": 63.44895553588867, + "objective/non_score_reward": -3.172447919845581, + "objective/rlhf_reward": -14.689790725708008, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.330820083618164, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.720703125, + "step": 875, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001457691192627 + }, + { + "episode": 14032, + "epoch": 0.08407328851661454, + "loss/policy_avg": 0.32664990425109863, + "lr": 9.440184049079757e-06, + "objective/entropy": -214.559326171875, + "objective/kl": 31.970054626464844, + "objective/non_score_reward": -1.5985026359558105, + "objective/rlhf_reward": -4.789890680376606, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 17.453887939453125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.673828125, + "step": 876, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978327751159668 + }, + { + "episode": 14048, + "epoch": 0.0841691531557441, + "loss/policy_avg": 0.44483697414398193, + "lr": 9.439544989775052e-06, + "objective/entropy": -130.65757751464844, + "objective/kl": 31.65274429321289, + "objective/non_score_reward": -1.5826371908187866, + "objective/rlhf_reward": -1.9305486440658566, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.450537204742432, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.77734375, + "step": 877, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0155625343322754 + }, + { + "episode": 14064, + "epoch": 0.08426501779487364, + "loss/policy_avg": -0.04940399155020714, + "lr": 9.438905930470349e-06, + "objective/entropy": -106.89505767822266, + "objective/kl": 33.43341064453125, + "objective/non_score_reward": -1.6716704368591309, + "objective/rlhf_reward": -5.130422203746393, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.4062023162841797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59765625, + "step": 878, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995322227478027 + }, + { + "episode": 14080, + "epoch": 0.08436088243400319, + "loss/policy_avg": 0.871749997138977, + "lr": 9.438266871165644e-06, + "objective/entropy": -9.485435485839844, + "objective/kl": 45.723243713378906, + "objective/non_score_reward": -2.2861623764038086, + "objective/rlhf_reward": -7.411315933863321, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 3.4815430641174316, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.724609375, + "step": 879, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9976509809494019 + }, + { + "episode": 14096, + "epoch": 0.08445674707313273, + "loss/policy_avg": 1.2218818664550781, + "lr": 9.43762781186094e-06, + "objective/entropy": 3.5897598266601562, + "objective/kl": 41.841548919677734, + "objective/non_score_reward": -2.0920774936676025, + "objective/rlhf_reward": -10.36830997467041, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.5200068950653076, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71875, + "step": 880, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0038094520568848 + }, + { + "episode": 14112, + "epoch": 0.08455261171226229, + "loss/policy_avg": 0.51161789894104, + "lr": 9.436988752556238e-06, + "objective/entropy": -167.14944458007812, + "objective/kl": 42.823936462402344, + "objective/non_score_reward": -2.1411969661712646, + "objective/rlhf_reward": -10.564787864685059, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.468809127807617, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 881, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9983466863632202 + }, + { + "episode": 14128, + "epoch": 0.08464847635139183, + "loss/policy_avg": 0.24786508083343506, + "lr": 9.436349693251534e-06, + "objective/entropy": 55.18758010864258, + "objective/kl": 50.31825637817383, + "objective/non_score_reward": -2.5159127712249756, + "objective/rlhf_reward": -12.063651084899902, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.60680389404297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859375, + "step": 882, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978296756744385 + }, + { + "episode": 14144, + "epoch": 0.08474434099052139, + "loss/policy_avg": 0.23247212171554565, + "lr": 9.435710633946831e-06, + "objective/entropy": -157.79147338867188, + "objective/kl": 38.8095703125, + "objective/non_score_reward": -1.9404785633087158, + "objective/rlhf_reward": -9.761914253234863, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.605966567993164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 883, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9964613914489746 + }, + { + "episode": 14160, + "epoch": 0.08484020562965093, + "loss/policy_avg": 0.5385473370552063, + "lr": 9.435071574642126e-06, + "objective/entropy": 27.40679931640625, + "objective/kl": 55.815799713134766, + "objective/non_score_reward": -2.79079008102417, + "objective/rlhf_reward": -8.239440594555113, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 28.129718780517578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.78125, + "step": 884, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975656270980835 + }, + { + "episode": 14176, + "epoch": 0.08493607026878049, + "loss/policy_avg": 0.36561644077301025, + "lr": 9.434432515337423e-06, + "objective/entropy": -97.6861572265625, + "objective/kl": 43.14677810668945, + "objective/non_score_reward": -2.1573386192321777, + "objective/rlhf_reward": -10.629354476928711, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.133277893066406, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.63671875, + "step": 885, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0011141300201416 + }, + { + "episode": 14192, + "epoch": 0.08503193490791003, + "loss/policy_avg": 0.024884674698114395, + "lr": 9.43379345603272e-06, + "objective/entropy": 120.14835357666016, + "objective/kl": 29.766036987304688, + "objective/non_score_reward": -1.4883018732070923, + "objective/rlhf_reward": -7.953207492828369, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.887042999267578, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.904296875, + "step": 886, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998403549194336 + }, + { + "episode": 14208, + "epoch": 0.08512779954703958, + "loss/policy_avg": 0.22865930199623108, + "lr": 9.433154396728017e-06, + "objective/entropy": 92.45376586914062, + "objective/kl": 33.31108856201172, + "objective/non_score_reward": -1.665554404258728, + "objective/rlhf_reward": -5.320581963568358, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 18.67316436767578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.830078125, + "step": 887, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997815489768982 + }, + { + "episode": 14224, + "epoch": 0.08522366418616913, + "loss/policy_avg": 0.29269108176231384, + "lr": 9.432515337423314e-06, + "objective/entropy": -49.20409393310547, + "objective/kl": 43.717506408691406, + "objective/non_score_reward": -2.185875177383423, + "objective/rlhf_reward": -8.743500709533691, + "objective/scores": 0.0, + "policy/approxkl_avg": 32.744873046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.625, + "step": 888, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994850158691406 + }, + { + "episode": 14240, + "epoch": 0.08531952882529868, + "loss/policy_avg": -0.05463641881942749, + "lr": 9.431876278118611e-06, + "objective/entropy": -116.11363220214844, + "objective/kl": 32.450172424316406, + "objective/non_score_reward": -1.6225087642669678, + "objective/rlhf_reward": -5.066203196247188, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 74.65079498291016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55078125, + "step": 889, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99924898147583 + }, + { + "episode": 14256, + "epoch": 0.08541539346442822, + "loss/policy_avg": -0.094816654920578, + "lr": 9.431237218813906e-06, + "objective/entropy": -196.58270263671875, + "objective/kl": 36.66675567626953, + "objective/non_score_reward": -1.8333379030227661, + "objective/rlhf_reward": -7.333351492881775, + "objective/scores": 0.0, + "policy/approxkl_avg": 61.466148376464844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69921875, + "step": 890, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001875877380371 + }, + { + "episode": 14272, + "epoch": 0.08551125810355778, + "loss/policy_avg": 0.14203904569149017, + "lr": 9.430598159509203e-06, + "objective/entropy": 33.77191925048828, + "objective/kl": 35.84291076660156, + "objective/non_score_reward": -1.7921457290649414, + "objective/rlhf_reward": -7.168582737445831, + "objective/scores": 0.0, + "policy/approxkl_avg": 114.2787857055664, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.740234375, + "step": 891, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001463890075684 + }, + { + "episode": 14288, + "epoch": 0.08560712274268732, + "loss/policy_avg": 0.8233531713485718, + "lr": 9.4299591002045e-06, + "objective/entropy": -79.67498779296875, + "objective/kl": 32.95916748046875, + "objective/non_score_reward": -1.647958517074585, + "objective/rlhf_reward": -8.59183406829834, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.586498260498047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.68359375, + "step": 892, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001398801803589 + }, + { + "episode": 14304, + "epoch": 0.08570298738181688, + "loss/policy_avg": -0.04238148778676987, + "lr": 9.429320040899797e-06, + "objective/entropy": -91.3941879272461, + "objective/kl": 37.733089447021484, + "objective/non_score_reward": -1.8866543769836426, + "objective/rlhf_reward": -9.54661750793457, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.671231269836426, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7265625, + "step": 893, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999361038208008 + }, + { + "episode": 14320, + "epoch": 0.08579885202094642, + "loss/policy_avg": 0.8634217977523804, + "lr": 9.428680981595094e-06, + "objective/entropy": -187.1932373046875, + "objective/kl": 33.27397537231445, + "objective/non_score_reward": -1.663698673248291, + "objective/rlhf_reward": -5.276192703334194, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 41.47370147705078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.740234375, + "step": 894, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989885091781616 + }, + { + "episode": 14336, + "epoch": 0.08589471666007598, + "loss/policy_avg": 0.5529655814170837, + "lr": 9.42804192229039e-06, + "objective/entropy": -60.362274169921875, + "objective/kl": 24.16449546813965, + "objective/non_score_reward": -1.2082247734069824, + "objective/rlhf_reward": -3.2766399373679906, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 11.35684871673584, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.697265625, + "step": 895, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998338222503662 + }, + { + "episode": 14352, + "epoch": 0.08599058129920552, + "loss/policy_avg": 0.19902795553207397, + "lr": 9.427402862985686e-06, + "objective/entropy": -51.02451705932617, + "objective/kl": 50.956565856933594, + "objective/non_score_reward": -2.547828435897827, + "objective/rlhf_reward": -12.191312789916992, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.6870810985565186, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 896, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997866153717041 + }, + { + "episode": 14368, + "epoch": 0.08608644593833507, + "loss/policy_avg": 0.13916707038879395, + "lr": 9.426763803680982e-06, + "objective/entropy": 1.1563072204589844, + "objective/kl": 40.80864715576172, + "objective/non_score_reward": -2.0404324531555176, + "objective/rlhf_reward": -8.161729454994202, + "objective/scores": 0.0, + "policy/approxkl_avg": 50.02666473388672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.751953125, + "step": 897, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974308013916016 + }, + { + "episode": 14384, + "epoch": 0.08618231057746462, + "loss/policy_avg": 0.24043650925159454, + "lr": 9.42612474437628e-06, + "objective/entropy": -237.5080108642578, + "objective/kl": 25.429439544677734, + "objective/non_score_reward": -1.2714718580245972, + "objective/rlhf_reward": -7.085887908935547, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.20086669921875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 898, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9987761974334717 + }, + { + "episode": 14400, + "epoch": 0.08627817521659417, + "loss/policy_avg": 0.3025432825088501, + "lr": 9.425485685071576e-06, + "objective/entropy": 24.299400329589844, + "objective/kl": 51.6057014465332, + "objective/non_score_reward": -2.58028507232666, + "objective/rlhf_reward": -7.921140050888061, + "objective/scores": 0.6, + "policy/approxkl_avg": 7.291698455810547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.599609375, + "step": 899, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978258609771729 + }, + { + "episode": 14416, + "epoch": 0.08637403985572371, + "loss/policy_avg": 0.7151256799697876, + "lr": 9.424846625766873e-06, + "objective/entropy": -254.74049377441406, + "objective/kl": 38.361732482910156, + "objective/non_score_reward": -1.9180867671966553, + "objective/rlhf_reward": -9.672347068786621, + "objective/scores": -0.5, + "policy/approxkl_avg": 71.389892578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.61328125, + "step": 900, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9969918727874756 + }, + { + "episode": 14432, + "epoch": 0.08646990449485327, + "loss/policy_avg": 0.24746102094650269, + "lr": 9.424207566462168e-06, + "objective/entropy": -101.08851623535156, + "objective/kl": 41.98121643066406, + "objective/non_score_reward": -2.0990607738494873, + "objective/rlhf_reward": -8.39624297618866, + "objective/scores": 0.0, + "policy/approxkl_avg": 156.69033813476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4580078125, + "step": 901, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991261959075928 + }, + { + "episode": 14448, + "epoch": 0.08656576913398281, + "loss/policy_avg": 0.22006197273731232, + "lr": 9.423568507157465e-06, + "objective/entropy": -124.14303588867188, + "objective/kl": 49.193702697753906, + "objective/non_score_reward": -2.4596850872039795, + "objective/rlhf_reward": -9.838740348815918, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.835973739624023, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.67578125, + "step": 902, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0029873847961426 + }, + { + "episode": 14464, + "epoch": 0.08666163377311237, + "loss/policy_avg": 0.5352858304977417, + "lr": 9.42292944785276e-06, + "objective/entropy": -157.1105499267578, + "objective/kl": 42.12428665161133, + "objective/non_score_reward": -2.1062145233154297, + "objective/rlhf_reward": -10.424858093261719, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.281829833984375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.630859375, + "step": 903, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992196559906006 + }, + { + "episode": 14480, + "epoch": 0.08675749841224191, + "loss/policy_avg": 0.03661314398050308, + "lr": 9.422290388548057e-06, + "objective/entropy": -62.714019775390625, + "objective/kl": 36.32717514038086, + "objective/non_score_reward": -1.8163588047027588, + "objective/rlhf_reward": -5.318024347500737, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.2670602798461914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.673828125, + "step": 904, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996294975280762 + }, + { + "episode": 14496, + "epoch": 0.08685336305137147, + "loss/policy_avg": -0.0723339319229126, + "lr": 9.421651329243354e-06, + "objective/entropy": -66.61566162109375, + "objective/kl": 38.04099655151367, + "objective/non_score_reward": -1.9020498991012573, + "objective/rlhf_reward": -9.608200073242188, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.432323932647705, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.708984375, + "step": 905, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.005786418914795 + }, + { + "episode": 14512, + "epoch": 0.08694922769050101, + "loss/policy_avg": -0.2698221802711487, + "lr": 9.421012269938651e-06, + "objective/entropy": -21.453994750976562, + "objective/kl": 46.13621520996094, + "objective/non_score_reward": -2.3068106174468994, + "objective/rlhf_reward": -7.867992841933651, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 19.2186222076416, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.72265625, + "step": 906, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0017006397247314 + }, + { + "episode": 14528, + "epoch": 0.08704509232963056, + "loss/policy_avg": 0.4099076986312866, + "lr": 9.420373210633948e-06, + "objective/entropy": -184.34783935546875, + "objective/kl": 39.42131805419922, + "objective/non_score_reward": -1.9710657596588135, + "objective/rlhf_reward": -9.88426399230957, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.30533218383789, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62890625, + "step": 907, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.995689868927002 + }, + { + "episode": 14544, + "epoch": 0.0871409569687601, + "loss/policy_avg": 0.9004054069519043, + "lr": 9.419734151329245e-06, + "objective/entropy": -60.35547637939453, + "objective/kl": 48.361732482910156, + "objective/non_score_reward": -2.418086528778076, + "objective/rlhf_reward": -9.672346472740173, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.700697898864746, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.591796875, + "step": 908, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999995231628418 + }, + { + "episode": 14560, + "epoch": 0.08723682160788966, + "loss/policy_avg": 0.9981076717376709, + "lr": 9.41909509202454e-06, + "objective/entropy": -102.42435455322266, + "objective/kl": 45.261810302734375, + "objective/non_score_reward": -2.2630903720855713, + "objective/rlhf_reward": -11.052361488342285, + "objective/scores": -0.5, + "policy/approxkl_avg": 54.88935852050781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.583984375, + "step": 909, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993700981140137 + }, + { + "episode": 14576, + "epoch": 0.0873326862470192, + "loss/policy_avg": 0.08262850344181061, + "lr": 9.418456032719837e-06, + "objective/entropy": 41.43779754638672, + "objective/kl": 53.031394958496094, + "objective/non_score_reward": -2.6515698432922363, + "objective/rlhf_reward": -12.606279373168945, + "objective/scores": -0.5, + "policy/approxkl_avg": 115.65996551513672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5390625, + "step": 910, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999428987503052 + }, + { + "episode": 14592, + "epoch": 0.08742855088614876, + "loss/policy_avg": -0.2485816478729248, + "lr": 9.417816973415134e-06, + "objective/entropy": -167.61972045898438, + "objective/kl": 31.972322463989258, + "objective/non_score_reward": -1.598616123199463, + "objective/rlhf_reward": -4.970632572372524, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 3.4127135276794434, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6171875, + "step": 911, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001767873764038 + }, + { + "episode": 14608, + "epoch": 0.0875244155252783, + "loss/policy_avg": 0.14652171730995178, + "lr": 9.41717791411043e-06, + "objective/entropy": -155.92514038085938, + "objective/kl": 43.048545837402344, + "objective/non_score_reward": -2.1524271965026855, + "objective/rlhf_reward": -7.005588564936238, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 9.724783897399902, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 912, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9966914653778076 + }, + { + "episode": 14624, + "epoch": 0.08762028016440786, + "loss/policy_avg": 0.5947450995445251, + "lr": 9.416538854805727e-06, + "objective/entropy": -175.47323608398438, + "objective/kl": 29.94611358642578, + "objective/non_score_reward": -1.4973056316375732, + "objective/rlhf_reward": -4.041810880379613, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 24.669536590576172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7890625, + "step": 913, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9969987869262695 + }, + { + "episode": 14640, + "epoch": 0.0877161448035374, + "loss/policy_avg": 0.05279015377163887, + "lr": 9.415899795501023e-06, + "objective/entropy": -145.57650756835938, + "objective/kl": 41.39718246459961, + "objective/non_score_reward": -2.069859266281128, + "objective/rlhf_reward": -6.617577200353729, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 55.54278564453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.607421875, + "step": 914, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994045495986938 + }, + { + "episode": 14656, + "epoch": 0.08781200944266696, + "loss/policy_avg": 1.1034376621246338, + "lr": 9.41526073619632e-06, + "objective/entropy": -1.6669483184814453, + "objective/kl": 48.39701843261719, + "objective/non_score_reward": -2.4198508262634277, + "objective/rlhf_reward": -11.679403305053711, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.05460262298584, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.71875, + "step": 915, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0021138191223145 + }, + { + "episode": 14672, + "epoch": 0.0879078740817965, + "loss/policy_avg": -0.053219109773635864, + "lr": 9.414621676891616e-06, + "objective/entropy": -173.81130981445312, + "objective/kl": 24.073429107666016, + "objective/non_score_reward": -1.2036715745925903, + "objective/rlhf_reward": -6.814686298370361, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.508467674255371, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.587890625, + "step": 916, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.005868911743164 + }, + { + "episode": 14688, + "epoch": 0.08800373872092605, + "loss/policy_avg": 3.3223953247070312, + "lr": 9.413982617586913e-06, + "objective/entropy": -43.047760009765625, + "objective/kl": 38.19256591796875, + "objective/non_score_reward": -1.90962815284729, + "objective/rlhf_reward": -9.63851261138916, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.597685813903809, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.474609375, + "step": 917, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000534772872925 + }, + { + "episode": 14704, + "epoch": 0.0880996033600556, + "loss/policy_avg": 0.73809415102005, + "lr": 9.41334355828221e-06, + "objective/entropy": 9.220403671264648, + "objective/kl": 38.54597473144531, + "objective/non_score_reward": -1.927298665046692, + "objective/rlhf_reward": -9.709195137023926, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.673072814941406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71875, + "step": 918, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988300800323486 + }, + { + "episode": 14720, + "epoch": 0.08819546799918515, + "loss/policy_avg": -0.010364736430346966, + "lr": 9.412704498977507e-06, + "objective/entropy": -142.30372619628906, + "objective/kl": 27.718883514404297, + "objective/non_score_reward": -1.3859442472457886, + "objective/rlhf_reward": -5.543776988983154, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1655259132385254, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.666015625, + "step": 919, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001488208770752 + }, + { + "episode": 14736, + "epoch": 0.08829133263831469, + "loss/policy_avg": 0.09719185531139374, + "lr": 9.412065439672802e-06, + "objective/entropy": -229.5596466064453, + "objective/kl": 23.601909637451172, + "objective/non_score_reward": -1.1800954341888428, + "objective/rlhf_reward": -4.720382034778595, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.1861003637313843, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.580078125, + "step": 920, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.00490665435791 + }, + { + "episode": 14752, + "epoch": 0.08838719727744425, + "loss/policy_avg": -0.004567261785268784, + "lr": 9.411426380368099e-06, + "objective/entropy": -190.52105712890625, + "objective/kl": 33.38599395751953, + "objective/non_score_reward": -1.6692997217178345, + "objective/rlhf_reward": -2.2771991252899166, + "objective/scores": 1.1, + "policy/approxkl_avg": 68.6070556640625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.720703125, + "step": 921, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9975864887237549 + }, + { + "episode": 14768, + "epoch": 0.0884830619165738, + "loss/policy_avg": 0.4888562560081482, + "lr": 9.410787321063396e-06, + "objective/entropy": -144.64292907714844, + "objective/kl": 37.262916564941406, + "objective/non_score_reward": -1.8631458282470703, + "objective/rlhf_reward": -7.452583193778992, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.921217918395996, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.521484375, + "step": 922, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9970510005950928 + }, + { + "episode": 14784, + "epoch": 0.08857892655570335, + "loss/policy_avg": 0.7242112159729004, + "lr": 9.410148261758691e-06, + "objective/entropy": -117.64860534667969, + "objective/kl": 38.69379806518555, + "objective/non_score_reward": -1.9346898794174194, + "objective/rlhf_reward": -6.076900010526764, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 44.3615608215332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.572265625, + "step": 923, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978076219558716 + }, + { + "episode": 14800, + "epoch": 0.0886747911948329, + "loss/policy_avg": 0.3952575922012329, + "lr": 9.409509202453988e-06, + "objective/entropy": -58.82710647583008, + "objective/kl": 45.47815704345703, + "objective/non_score_reward": -2.2739081382751465, + "objective/rlhf_reward": -7.433772688329803, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 25.212846755981445, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.66015625, + "step": 924, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973945617675781 + }, + { + "episode": 14816, + "epoch": 0.08877065583396244, + "loss/policy_avg": 0.06768083572387695, + "lr": 9.408870143149285e-06, + "objective/entropy": -222.38186645507812, + "objective/kl": 26.613353729248047, + "objective/non_score_reward": -1.3306677341461182, + "objective/rlhf_reward": -5.322670936584473, + "objective/scores": 0.0, + "policy/approxkl_avg": 52.69890594482422, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.787109375, + "step": 925, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002758741378784 + }, + { + "episode": 14832, + "epoch": 0.088866520473092, + "loss/policy_avg": 0.37387263774871826, + "lr": 9.408231083844582e-06, + "objective/entropy": -61.06507873535156, + "objective/kl": 39.82828903198242, + "objective/non_score_reward": -1.9914145469665527, + "objective/rlhf_reward": -3.5656581878662106, + "objective/scores": 1.1, + "policy/approxkl_avg": 47.79099655151367, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48828125, + "step": 926, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988874197006226 + }, + { + "episode": 14848, + "epoch": 0.08896238511222154, + "loss/policy_avg": -0.01355433464050293, + "lr": 9.407592024539877e-06, + "objective/entropy": 41.936790466308594, + "objective/kl": 46.56926727294922, + "objective/non_score_reward": -2.328463315963745, + "objective/rlhf_reward": -11.31385326385498, + "objective/scores": -0.5, + "policy/approxkl_avg": 105.09312438964844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8125, + "step": 927, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0027990341186523 + }, + { + "episode": 14864, + "epoch": 0.0890582497513511, + "loss/policy_avg": 0.818713903427124, + "lr": 9.406952965235174e-06, + "objective/entropy": -219.26345825195312, + "objective/kl": 38.234500885009766, + "objective/non_score_reward": -1.9117250442504883, + "objective/rlhf_reward": -9.646900177001953, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.689947128295898, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58203125, + "step": 928, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998668432235718 + }, + { + "episode": 14880, + "epoch": 0.08915411439048064, + "loss/policy_avg": 0.06567949056625366, + "lr": 9.40631390593047e-06, + "objective/entropy": -158.48692321777344, + "objective/kl": 35.28874588012695, + "objective/non_score_reward": -1.764437198638916, + "objective/rlhf_reward": -7.057748913764954, + "objective/scores": 0.0, + "policy/approxkl_avg": 38.171913146972656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.732421875, + "step": 929, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.005028486251831 + }, + { + "episode": 14896, + "epoch": 0.0892499790296102, + "loss/policy_avg": 0.06915108114480972, + "lr": 9.405674846625768e-06, + "objective/entropy": -48.55313491821289, + "objective/kl": 31.88959503173828, + "objective/non_score_reward": -1.5944796800613403, + "objective/rlhf_reward": -5.06723760624826, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 99.60332489013672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6796875, + "step": 930, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999077320098877 + }, + { + "episode": 14912, + "epoch": 0.08934584366873974, + "loss/policy_avg": -0.039553724229335785, + "lr": 9.405035787321065e-06, + "objective/entropy": -52.474571228027344, + "objective/kl": 50.911964416503906, + "objective/non_score_reward": -2.545598030090332, + "objective/rlhf_reward": -12.182392120361328, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.06097412109375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.623046875, + "step": 931, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978675842285156 + }, + { + "episode": 14928, + "epoch": 0.0894417083078693, + "loss/policy_avg": 0.0016290303319692612, + "lr": 9.404396728016361e-06, + "objective/entropy": -83.39149475097656, + "objective/kl": 45.000240325927734, + "objective/non_score_reward": -2.250011920928955, + "objective/rlhf_reward": -7.338188534200775, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 2.066300868988037, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.669921875, + "step": 932, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0031819343566895 + }, + { + "episode": 14944, + "epoch": 0.08953757294699884, + "loss/policy_avg": 0.7307361364364624, + "lr": 9.403757668711657e-06, + "objective/entropy": -57.11241149902344, + "objective/kl": 50.165870666503906, + "objective/non_score_reward": -2.508293628692627, + "objective/rlhf_reward": -12.033174514770508, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.401668548583984, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.708984375, + "step": 933, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9973030090332031 + }, + { + "episode": 14960, + "epoch": 0.08963343758612839, + "loss/policy_avg": 0.723739504814148, + "lr": 9.403118609406953e-06, + "objective/entropy": -76.68917846679688, + "objective/kl": 37.09356689453125, + "objective/non_score_reward": -1.8546783924102783, + "objective/rlhf_reward": -9.418713569641113, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.10303497314453, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.697265625, + "step": 934, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975576400756836 + }, + { + "episode": 14976, + "epoch": 0.08972930222525793, + "loss/policy_avg": 0.13655754923820496, + "lr": 9.40247955010225e-06, + "objective/entropy": -187.17822265625, + "objective/kl": 26.66393280029297, + "objective/non_score_reward": -1.3331966400146484, + "objective/rlhf_reward": -5.332786798477173, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1211817264556885, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.677734375, + "step": 935, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0001492500305176 + }, + { + "episode": 14992, + "epoch": 0.08982516686438749, + "loss/policy_avg": 0.08366826176643372, + "lr": 9.401840490797547e-06, + "objective/entropy": -112.3772201538086, + "objective/kl": 43.602718353271484, + "objective/non_score_reward": -2.180135726928711, + "objective/rlhf_reward": -7.164284079280451, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 58.001258850097656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.669921875, + "step": 936, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998914361000061 + }, + { + "episode": 15008, + "epoch": 0.08992103150351703, + "loss/policy_avg": 0.2768528163433075, + "lr": 9.401201431492844e-06, + "objective/entropy": -176.91864013671875, + "objective/kl": 37.214691162109375, + "objective/non_score_reward": -1.860734462738037, + "objective/rlhf_reward": -9.442937850952148, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.972652435302734, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.62890625, + "step": 937, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00278902053833 + }, + { + "episode": 15024, + "epoch": 0.09001689614264659, + "loss/policy_avg": 0.08134534955024719, + "lr": 9.40056237218814e-06, + "objective/entropy": -20.97055435180664, + "objective/kl": 38.23248291015625, + "objective/non_score_reward": -1.9116241931915283, + "objective/rlhf_reward": -7.646496891975403, + "objective/scores": 0.0, + "policy/approxkl_avg": 37.703094482421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80859375, + "step": 938, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997063398361206 + }, + { + "episode": 15040, + "epoch": 0.09011276078177613, + "loss/policy_avg": 0.3275008797645569, + "lr": 9.399923312883436e-06, + "objective/entropy": -126.31497192382812, + "objective/kl": 29.49127197265625, + "objective/non_score_reward": -1.4745635986328125, + "objective/rlhf_reward": -7.898254871368408, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.51213836669922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6171875, + "step": 939, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998835802078247 + }, + { + "episode": 15056, + "epoch": 0.09020862542090569, + "loss/policy_avg": 0.4928390383720398, + "lr": 9.399284253578733e-06, + "objective/entropy": -25.302574157714844, + "objective/kl": 38.14933776855469, + "objective/non_score_reward": -1.9074668884277344, + "objective/rlhf_reward": -5.682456324772771, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 17.401504516601562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.703125, + "step": 940, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999890327453613 + }, + { + "episode": 15072, + "epoch": 0.09030449006003523, + "loss/policy_avg": 0.11759189516305923, + "lr": 9.39864519427403e-06, + "objective/entropy": -86.40337371826172, + "objective/kl": 40.93019485473633, + "objective/non_score_reward": -2.0465097427368164, + "objective/rlhf_reward": -10.186038970947266, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.425281524658203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 941, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997210502624512 + }, + { + "episode": 15088, + "epoch": 0.09040035469916478, + "loss/policy_avg": -0.10709141939878464, + "lr": 9.398006134969327e-06, + "objective/entropy": -81.64563751220703, + "objective/kl": 51.18853759765625, + "objective/non_score_reward": -2.55942702293396, + "objective/rlhf_reward": -12.23770809173584, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.02937316894531, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.501953125, + "step": 942, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0021045207977295 + }, + { + "episode": 15104, + "epoch": 0.09049621933829433, + "loss/policy_avg": 0.3953965902328491, + "lr": 9.397367075664624e-06, + "objective/entropy": -99.81330871582031, + "objective/kl": 32.06804275512695, + "objective/non_score_reward": -1.6034021377563477, + "objective/rlhf_reward": -4.809488449160176, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 40.51140594482422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.59765625, + "step": 943, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988559484481812 + }, + { + "episode": 15120, + "epoch": 0.09059208397742388, + "loss/policy_avg": 0.2747941017150879, + "lr": 9.396728016359919e-06, + "objective/entropy": -201.62356567382812, + "objective/kl": 39.66442108154297, + "objective/non_score_reward": -1.9832209348678589, + "objective/rlhf_reward": -9.932884216308594, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.135772705078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5390625, + "step": 944, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9971367120742798 + }, + { + "episode": 15136, + "epoch": 0.09068794861655342, + "loss/policy_avg": 0.20382410287857056, + "lr": 9.396088957055216e-06, + "objective/entropy": -125.5006103515625, + "objective/kl": 38.4227294921875, + "objective/non_score_reward": -1.921136498451233, + "objective/rlhf_reward": -6.168774330409702, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 32.14828109741211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.486328125, + "step": 945, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9955699443817139 + }, + { + "episode": 15152, + "epoch": 0.09078381325568298, + "loss/policy_avg": -0.19265419244766235, + "lr": 9.395449897750511e-06, + "objective/entropy": -112.85115814208984, + "objective/kl": 43.549034118652344, + "objective/non_score_reward": -2.1774516105651855, + "objective/rlhf_reward": -8.7098069190979, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.38921356201172, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.53125, + "step": 946, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999749660491943 + }, + { + "episode": 15168, + "epoch": 0.09087967789481252, + "loss/policy_avg": 0.09030474722385406, + "lr": 9.394810838445808e-06, + "objective/entropy": -230.20339965820312, + "objective/kl": 35.29817199707031, + "objective/non_score_reward": -1.7649086713790894, + "objective/rlhf_reward": -2.659634685516357, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.55244255065918, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.72265625, + "step": 947, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001312017440796 + }, + { + "episode": 15184, + "epoch": 0.09097554253394208, + "loss/policy_avg": -0.5793415904045105, + "lr": 9.394171779141105e-06, + "objective/entropy": -144.71697998046875, + "objective/kl": 50.93728256225586, + "objective/non_score_reward": -2.5468640327453613, + "objective/rlhf_reward": -12.187456130981445, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.086793899536133, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.599609375, + "step": 948, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0038492679595947 + }, + { + "episode": 15200, + "epoch": 0.09107140717307162, + "loss/policy_avg": 0.31683194637298584, + "lr": 9.393532719836402e-06, + "objective/entropy": -211.3304901123047, + "objective/kl": 30.781959533691406, + "objective/non_score_reward": -1.5390980243682861, + "objective/rlhf_reward": -4.552272114817219, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 4.273219108581543, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.666015625, + "step": 949, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984002113342285 + }, + { + "episode": 15216, + "epoch": 0.09116727181220118, + "loss/policy_avg": 0.1032361388206482, + "lr": 9.392893660531698e-06, + "objective/entropy": 21.91994285583496, + "objective/kl": 32.9478645324707, + "objective/non_score_reward": -1.6473931074142456, + "objective/rlhf_reward": -5.033313124385431, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 17.144466400146484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.794921875, + "step": 950, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996211528778076 + }, + { + "episode": 15232, + "epoch": 0.09126313645133072, + "loss/policy_avg": 0.5440107583999634, + "lr": 9.392254601226994e-06, + "objective/entropy": -53.61236572265625, + "objective/kl": 37.240875244140625, + "objective/non_score_reward": -1.862043857574463, + "objective/rlhf_reward": -9.448175430297852, + "objective/scores": -0.5, + "policy/approxkl_avg": 81.52009582519531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 951, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9972529411315918 + }, + { + "episode": 15248, + "epoch": 0.09135900109046027, + "loss/policy_avg": 0.14197972416877747, + "lr": 9.39161554192229e-06, + "objective/entropy": -139.08221435546875, + "objective/kl": 46.49334716796875, + "objective/non_score_reward": -2.324667453765869, + "objective/rlhf_reward": -11.298669815063477, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.933968544006348, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 952, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9968202114105225 + }, + { + "episode": 15264, + "epoch": 0.09145486572958982, + "loss/policy_avg": 0.25565198063850403, + "lr": 9.390976482617587e-06, + "objective/entropy": -55.819522857666016, + "objective/kl": 46.55219650268555, + "objective/non_score_reward": -2.3276100158691406, + "objective/rlhf_reward": -7.99975906868875, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 6.205958366394043, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65625, + "step": 953, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001270055770874 + }, + { + "episode": 15280, + "epoch": 0.09155073036871937, + "loss/policy_avg": 0.16393278539180756, + "lr": 9.390337423312884e-06, + "objective/entropy": -192.65573120117188, + "objective/kl": 49.46628189086914, + "objective/non_score_reward": -2.4733142852783203, + "objective/rlhf_reward": -8.442659000964507, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 118.62382507324219, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5625, + "step": 954, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983826875686646 + }, + { + "episode": 15296, + "epoch": 0.09164659500784891, + "loss/policy_avg": 0.6506268978118896, + "lr": 9.389698364008181e-06, + "objective/entropy": -1.021087646484375, + "objective/kl": 32.492759704589844, + "objective/non_score_reward": -1.6246379613876343, + "objective/rlhf_reward": -5.074719984729853, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 149.66799926757812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.81640625, + "step": 955, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9964890480041504 + }, + { + "episode": 15312, + "epoch": 0.09174245964697847, + "loss/policy_avg": 0.010403938591480255, + "lr": 9.389059304703478e-06, + "objective/entropy": -23.017173767089844, + "objective/kl": 35.98017120361328, + "objective/non_score_reward": -1.7990086078643799, + "objective/rlhf_reward": -9.19603443145752, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.22949981689453, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.76953125, + "step": 956, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990930557250977 + }, + { + "episode": 15328, + "epoch": 0.09183832428610801, + "loss/policy_avg": -0.30354610085487366, + "lr": 9.388420245398773e-06, + "objective/entropy": -150.84063720703125, + "objective/kl": 33.909183502197266, + "objective/non_score_reward": -1.6954591274261475, + "objective/rlhf_reward": -3.8581174954187603, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 53.11094284057617, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.65625, + "step": 957, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998666763305664 + }, + { + "episode": 15344, + "epoch": 0.09193418892523757, + "loss/policy_avg": 0.18794779479503632, + "lr": 9.38778118609407e-06, + "objective/entropy": 79.04150390625, + "objective/kl": 33.25880432128906, + "objective/non_score_reward": -1.662940263748169, + "objective/rlhf_reward": -8.651761054992676, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.56779098510742, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.63671875, + "step": 958, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.995115876197815 + }, + { + "episode": 15360, + "epoch": 0.09203005356436711, + "loss/policy_avg": 0.3746708631515503, + "lr": 9.387142126789367e-06, + "objective/entropy": -139.40573120117188, + "objective/kl": 45.40739440917969, + "objective/non_score_reward": -2.2703697681427, + "objective/rlhf_reward": -9.081478834152222, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.837141990661621, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.57421875, + "step": 959, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9977433681488037 + }, + { + "episode": 15376, + "epoch": 0.09212591820349667, + "loss/policy_avg": 0.24001380801200867, + "lr": 9.386503067484664e-06, + "objective/entropy": -159.99990844726562, + "objective/kl": 29.465959548950195, + "objective/non_score_reward": -1.4732978343963623, + "objective/rlhf_reward": -4.442593435855255, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 29.260818481445312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73046875, + "step": 960, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993481636047363 + }, + { + "episode": 15392, + "epoch": 0.09222178284262621, + "loss/policy_avg": 1.6455062627792358, + "lr": 9.38586400817996e-06, + "objective/entropy": -34.35570526123047, + "objective/kl": 38.343467712402344, + "objective/non_score_reward": -1.9171736240386963, + "objective/rlhf_reward": -7.668694496154785, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.91456937789917, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.544921875, + "step": 961, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982693195343018 + }, + { + "episode": 15408, + "epoch": 0.09231764748175576, + "loss/policy_avg": 0.08633384108543396, + "lr": 9.385224948875256e-06, + "objective/entropy": -82.11976623535156, + "objective/kl": 39.867191314697266, + "objective/non_score_reward": -1.9933594465255737, + "objective/rlhf_reward": -9.973438262939453, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.890167236328125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.615234375, + "step": 962, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998676061630249 + }, + { + "episode": 15424, + "epoch": 0.0924135121208853, + "loss/policy_avg": 0.27613601088523865, + "lr": 9.384585889570553e-06, + "objective/entropy": -160.2120361328125, + "objective/kl": 45.641685485839844, + "objective/non_score_reward": -2.2820839881896973, + "objective/rlhf_reward": -11.128335952758789, + "objective/scores": -0.5, + "policy/approxkl_avg": 65.47406005859375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.607421875, + "step": 963, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.000150442123413 + }, + { + "episode": 15440, + "epoch": 0.09250937676001486, + "loss/policy_avg": 0.1509678214788437, + "lr": 9.38394683026585e-06, + "objective/entropy": -80.69264221191406, + "objective/kl": 33.72905731201172, + "objective/non_score_reward": -1.686452865600586, + "objective/rlhf_reward": -6.745811700820923, + "objective/scores": 0.0, + "policy/approxkl_avg": 20.697582244873047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6796875, + "step": 964, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.995528221130371 + }, + { + "episode": 15456, + "epoch": 0.0926052413991444, + "loss/policy_avg": 0.6833748817443848, + "lr": 9.383307770961147e-06, + "objective/entropy": -51.16916275024414, + "objective/kl": 41.589149475097656, + "objective/non_score_reward": -2.0794572830200195, + "objective/rlhf_reward": -10.317829132080078, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.314876556396484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.583984375, + "step": 965, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978783130645752 + }, + { + "episode": 15472, + "epoch": 0.09270110603827396, + "loss/policy_avg": 0.9105867743492126, + "lr": 9.382668711656443e-06, + "objective/entropy": -192.4179229736328, + "objective/kl": 40.383575439453125, + "objective/non_score_reward": -2.019178867340088, + "objective/rlhf_reward": -8.07671570777893, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.753993988037109, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.587890625, + "step": 966, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001120567321777 + }, + { + "episode": 15488, + "epoch": 0.0927969706774035, + "loss/policy_avg": -0.21336907148361206, + "lr": 9.382029652351739e-06, + "objective/entropy": 5.789453506469727, + "objective/kl": 58.350379943847656, + "objective/non_score_reward": -2.9175190925598145, + "objective/rlhf_reward": -13.670076370239258, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.937045097351074, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.642578125, + "step": 967, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0048317909240723 + }, + { + "episode": 15504, + "epoch": 0.09289283531653306, + "loss/policy_avg": 0.7795835733413696, + "lr": 9.381390593047035e-06, + "objective/entropy": -190.95095825195312, + "objective/kl": 44.154022216796875, + "objective/non_score_reward": -2.2077012062072754, + "objective/rlhf_reward": -8.830804228782654, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.231604099273682, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4375, + "step": 968, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000349283218384 + }, + { + "episode": 15520, + "epoch": 0.0929886999556626, + "loss/policy_avg": 0.37494808435440063, + "lr": 9.380751533742332e-06, + "objective/entropy": -196.76791381835938, + "objective/kl": 46.036712646484375, + "objective/non_score_reward": -2.301835536956787, + "objective/rlhf_reward": -9.207342028617859, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0788402557373047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.568359375, + "step": 969, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999101996421814 + }, + { + "episode": 15536, + "epoch": 0.09308456459479215, + "loss/policy_avg": 0.1790609210729599, + "lr": 9.380112474437628e-06, + "objective/entropy": -64.12092590332031, + "objective/kl": 31.128215789794922, + "objective/non_score_reward": -1.556410789489746, + "objective/rlhf_reward": -6.225643157958984, + "objective/scores": 0.0, + "policy/approxkl_avg": 30.929574966430664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 970, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978744983673096 + }, + { + "episode": 15552, + "epoch": 0.0931804292339217, + "loss/policy_avg": 0.32251641154289246, + "lr": 9.379473415132924e-06, + "objective/entropy": -37.61677932739258, + "objective/kl": 33.060455322265625, + "objective/non_score_reward": -1.6530228853225708, + "objective/rlhf_reward": -5.252841675017757, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 44.70690155029297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.783203125, + "step": 971, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9973013401031494 + }, + { + "episode": 15568, + "epoch": 0.09327629387305125, + "loss/policy_avg": 0.17544952034950256, + "lr": 9.378834355828221e-06, + "objective/entropy": -186.7581787109375, + "objective/kl": 43.407676696777344, + "objective/non_score_reward": -2.170383930206299, + "objective/rlhf_reward": -8.681535363197327, + "objective/scores": 0.0, + "policy/approxkl_avg": 34.08558654785156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.634765625, + "step": 972, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980778694152832 + }, + { + "episode": 15584, + "epoch": 0.0933721585121808, + "loss/policy_avg": 1.3969241380691528, + "lr": 9.378195296523518e-06, + "objective/entropy": -110.88290405273438, + "objective/kl": 33.68537139892578, + "objective/non_score_reward": -1.684268593788147, + "objective/rlhf_reward": -6.737074494361877, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.057842254638672, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5078125, + "step": 973, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0030934810638428 + }, + { + "episode": 15600, + "epoch": 0.09346802315131035, + "loss/policy_avg": 0.6720627546310425, + "lr": 9.377556237218815e-06, + "objective/entropy": -42.06459045410156, + "objective/kl": 39.157737731933594, + "objective/non_score_reward": -1.9578869342803955, + "objective/rlhf_reward": -7.831547379493713, + "objective/scores": 0.0, + "policy/approxkl_avg": 41.041465759277344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 974, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99678373336792 + }, + { + "episode": 15616, + "epoch": 0.09356388779043989, + "loss/policy_avg": 0.27696704864501953, + "lr": 9.37691717791411e-06, + "objective/entropy": -130.134765625, + "objective/kl": 36.255157470703125, + "objective/non_score_reward": -1.8127578496932983, + "objective/rlhf_reward": -5.7352598545872535, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 13.770370483398438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.638671875, + "step": 975, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001678705215454 + }, + { + "episode": 15632, + "epoch": 0.09365975242956945, + "loss/policy_avg": 0.34242314100265503, + "lr": 9.376278118609407e-06, + "objective/entropy": -213.66635131835938, + "objective/kl": 43.86079025268555, + "objective/non_score_reward": -2.193039655685425, + "objective/rlhf_reward": -8.77215838432312, + "objective/scores": 0.0, + "policy/approxkl_avg": 101.20123291015625, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4951171875, + "step": 976, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002193450927734 + }, + { + "episode": 15648, + "epoch": 0.09375561706869899, + "loss/policy_avg": 0.3875897228717804, + "lr": 9.375639059304704e-06, + "objective/entropy": -57.457542419433594, + "objective/kl": 37.4965705871582, + "objective/non_score_reward": -1.874828815460205, + "objective/rlhf_reward": -5.37660855271009, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 55.72274398803711, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.80078125, + "step": 977, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9988607168197632 + }, + { + "episode": 15664, + "epoch": 0.09385148170782855, + "loss/policy_avg": 0.14154575765132904, + "lr": 9.375000000000001e-06, + "objective/entropy": -50.779911041259766, + "objective/kl": 37.54152297973633, + "objective/non_score_reward": -1.8770761489868164, + "objective/rlhf_reward": -7.508304834365845, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.101675033569336, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5625, + "step": 978, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001704692840576 + }, + { + "episode": 15680, + "epoch": 0.09394734634695809, + "loss/policy_avg": 0.05301086604595184, + "lr": 9.374360940695298e-06, + "objective/entropy": -237.26416015625, + "objective/kl": 33.35430145263672, + "objective/non_score_reward": -1.6677148342132568, + "objective/rlhf_reward": -6.670859634876251, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.969975471496582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 979, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985926151275635 + }, + { + "episode": 15696, + "epoch": 0.09404321098608764, + "loss/policy_avg": 0.44358035922050476, + "lr": 9.373721881390595e-06, + "objective/entropy": -99.7123794555664, + "objective/kl": 38.239044189453125, + "objective/non_score_reward": -1.9119523763656616, + "objective/rlhf_reward": -7.6478095054626465, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.88211441040039, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.640625, + "step": 980, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996304988861084 + }, + { + "episode": 15712, + "epoch": 0.0941390756252172, + "loss/policy_avg": -0.010519023984670639, + "lr": 9.37308282208589e-06, + "objective/entropy": -201.13967895507812, + "objective/kl": 37.26094055175781, + "objective/non_score_reward": -1.8630470037460327, + "objective/rlhf_reward": -7.452187776565552, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6627287864685059, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6328125, + "step": 981, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0017895698547363 + }, + { + "episode": 15728, + "epoch": 0.09423494026434674, + "loss/policy_avg": 0.19608743488788605, + "lr": 9.372443762781187e-06, + "objective/entropy": -134.354248046875, + "objective/kl": 34.5666389465332, + "objective/non_score_reward": -1.7283319234848022, + "objective/rlhf_reward": -6.913327574729919, + "objective/scores": 0.0, + "policy/approxkl_avg": 22.905805587768555, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.60546875, + "step": 982, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987757205963135 + }, + { + "episode": 15744, + "epoch": 0.0943308049034763, + "loss/policy_avg": 1.1836085319519043, + "lr": 9.371804703476484e-06, + "objective/entropy": -148.82774353027344, + "objective/kl": 38.39320755004883, + "objective/non_score_reward": -1.9196603298187256, + "objective/rlhf_reward": -9.678642272949219, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.917579650878906, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.62890625, + "step": 983, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988484382629395 + }, + { + "episode": 15760, + "epoch": 0.09442666954260584, + "loss/policy_avg": 0.0741516649723053, + "lr": 9.37116564417178e-06, + "objective/entropy": -214.27142333984375, + "objective/kl": 29.813629150390625, + "objective/non_score_reward": -1.490681529045105, + "objective/rlhf_reward": -5.962726056575775, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.328061580657959, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.662109375, + "step": 984, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989229440689087 + }, + { + "episode": 15776, + "epoch": 0.0945225341817354, + "loss/policy_avg": 0.4623241126537323, + "lr": 9.370526584867077e-06, + "objective/entropy": -67.21552276611328, + "objective/kl": 34.03821563720703, + "objective/non_score_reward": -1.7019107341766357, + "objective/rlhf_reward": -6.807642936706543, + "objective/scores": 0.0, + "policy/approxkl_avg": 101.1454849243164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.724609375, + "step": 985, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.996826410293579 + }, + { + "episode": 15792, + "epoch": 0.09461839882086494, + "loss/policy_avg": -0.0862913578748703, + "lr": 9.369887525562373e-06, + "objective/entropy": -131.0811309814453, + "objective/kl": 38.760162353515625, + "objective/non_score_reward": -1.9380083084106445, + "objective/rlhf_reward": -7.752032995223999, + "objective/scores": 0.0, + "policy/approxkl_avg": 22.67949104309082, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.546875, + "step": 986, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002801418304443 + }, + { + "episode": 15808, + "epoch": 0.0947142634599945, + "loss/policy_avg": -0.11611436307430267, + "lr": 9.36924846625767e-06, + "objective/entropy": -118.61439514160156, + "objective/kl": 33.61301803588867, + "objective/non_score_reward": -1.6806509494781494, + "objective/rlhf_reward": -5.344001808253628, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 5.671133518218994, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.587890625, + "step": 987, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001059055328369 + }, + { + "episode": 15824, + "epoch": 0.09481012809912404, + "loss/policy_avg": -0.056146666407585144, + "lr": 9.368609406952966e-06, + "objective/entropy": -147.88731384277344, + "objective/kl": 40.43733215332031, + "objective/non_score_reward": -2.0218665599823, + "objective/rlhf_reward": -8.087466716766357, + "objective/scores": 0.0, + "policy/approxkl_avg": 40.682838439941406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.705078125, + "step": 988, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998780369758606 + }, + { + "episode": 15840, + "epoch": 0.09490599273825359, + "loss/policy_avg": 0.6434881687164307, + "lr": 9.367970347648263e-06, + "objective/entropy": -209.21951293945312, + "objective/kl": 34.311981201171875, + "objective/non_score_reward": -1.7155991792678833, + "objective/rlhf_reward": -5.43856461783227, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 17.39712142944336, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.615234375, + "step": 989, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998345375061035 + }, + { + "episode": 15856, + "epoch": 0.09500185737738313, + "loss/policy_avg": 0.5555615425109863, + "lr": 9.367331288343558e-06, + "objective/entropy": -216.50015258789062, + "objective/kl": 41.78931427001953, + "objective/non_score_reward": -2.089465856552124, + "objective/rlhf_reward": -6.934031446178523, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 8.619867324829102, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.615234375, + "step": 990, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983088970184326 + }, + { + "episode": 15872, + "epoch": 0.09509772201651269, + "loss/policy_avg": 0.014517553150653839, + "lr": 9.366692229038855e-06, + "objective/entropy": -103.5342788696289, + "objective/kl": 45.42181396484375, + "objective/non_score_reward": -2.2710909843444824, + "objective/rlhf_reward": -7.7251133558496665, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 122.64480590820312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 991, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9976680278778076 + }, + { + "episode": 15888, + "epoch": 0.09519358665564223, + "loss/policy_avg": 0.14491388201713562, + "lr": 9.366053169734152e-06, + "objective/entropy": -104.99143981933594, + "objective/kl": 40.18424987792969, + "objective/non_score_reward": -2.0092127323150635, + "objective/rlhf_reward": -10.036850929260254, + "objective/scores": -0.5, + "policy/approxkl_avg": 92.04847717285156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.61328125, + "step": 992, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000312089920044 + }, + { + "episode": 15904, + "epoch": 0.09528945129477179, + "loss/policy_avg": 0.14479607343673706, + "lr": 9.365414110429449e-06, + "objective/entropy": 76.6357192993164, + "objective/kl": 36.01222229003906, + "objective/non_score_reward": -1.800611138343811, + "objective/rlhf_reward": -7.202444434165955, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.827505111694336, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 993, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000792980194092 + }, + { + "episode": 15920, + "epoch": 0.09538531593390133, + "loss/policy_avg": 0.1194826140999794, + "lr": 9.364775051124744e-06, + "objective/entropy": -207.61534118652344, + "objective/kl": 43.327735900878906, + "objective/non_score_reward": -2.1663870811462402, + "objective/rlhf_reward": -4.265548324584961, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.786656379699707, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.76953125, + "step": 994, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000849723815918 + }, + { + "episode": 15936, + "epoch": 0.09548118057303089, + "loss/policy_avg": 0.219602569937706, + "lr": 9.364135991820041e-06, + "objective/entropy": -114.63105773925781, + "objective/kl": 41.43864822387695, + "objective/non_score_reward": -2.071932315826416, + "objective/rlhf_reward": -6.837131480784759, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 7.255519866943359, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.728515625, + "step": 995, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9999138116836548 + }, + { + "episode": 15952, + "epoch": 0.09557704521216043, + "loss/policy_avg": -0.04379921406507492, + "lr": 9.363496932515338e-06, + "objective/entropy": -152.91604614257812, + "objective/kl": 24.346500396728516, + "objective/non_score_reward": -1.2173250913619995, + "objective/rlhf_reward": -4.869300127029419, + "objective/scores": 0.0, + "policy/approxkl_avg": 28.464168548583984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.578125, + "step": 996, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0003364086151123 + }, + { + "episode": 15968, + "epoch": 0.09567290985128998, + "loss/policy_avg": 0.6855503916740417, + "lr": 9.362857873210635e-06, + "objective/entropy": -251.57192993164062, + "objective/kl": 38.284061431884766, + "objective/non_score_reward": -1.9142030477523804, + "objective/rlhf_reward": -7.6568121910095215, + "objective/scores": 0.0, + "policy/approxkl_avg": 28.274316787719727, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.498046875, + "step": 997, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001492977142334 + }, + { + "episode": 15984, + "epoch": 0.09576877449041953, + "loss/policy_avg": 0.23039552569389343, + "lr": 9.362218813905932e-06, + "objective/entropy": -207.974609375, + "objective/kl": 28.3123779296875, + "objective/non_score_reward": -1.415618896484375, + "objective/rlhf_reward": -4.146703624519047, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 12.781830787658691, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.572265625, + "step": 998, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9994380474090576 + }, + { + "episode": 16000, + "epoch": 0.09586463912954908, + "loss/policy_avg": 0.8329517841339111, + "lr": 9.361579754601227e-06, + "objective/entropy": -144.7649688720703, + "objective/kl": 42.098541259765625, + "objective/non_score_reward": -2.1049270629882812, + "objective/rlhf_reward": -8.419708490371704, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.3122140169143677, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.75, + "step": 999, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.003618001937866 + }, + { + "episode": 16016, + "epoch": 0.09596050376867862, + "loss/policy_avg": 0.20383863151073456, + "lr": 9.360940695296524e-06, + "objective/entropy": -138.388671875, + "objective/kl": 35.74339294433594, + "objective/non_score_reward": -1.7871696949005127, + "objective/rlhf_reward": -7.148679137229919, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.984969139099121, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 1000, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974277019500732 + }, + { + "episode": 16032, + "epoch": 0.09605636840780818, + "loss/policy_avg": 0.18677181005477905, + "lr": 9.36030163599182e-06, + "objective/entropy": -179.36773681640625, + "objective/kl": 37.302913665771484, + "objective/non_score_reward": -1.8651458024978638, + "objective/rlhf_reward": -6.101332986091061, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 14.311004638671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.578125, + "step": 1001, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0003609657287598 + }, + { + "episode": 16048, + "epoch": 0.09615223304693772, + "loss/policy_avg": 0.05936926230788231, + "lr": 9.359662576687117e-06, + "objective/entropy": -36.76812744140625, + "objective/kl": 36.659873962402344, + "objective/non_score_reward": -1.832993984222412, + "objective/rlhf_reward": -5.953373649207455, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 34.80658721923828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.662109375, + "step": 1002, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985153675079346 + }, + { + "episode": 16064, + "epoch": 0.09624809768606728, + "loss/policy_avg": 0.12403427064418793, + "lr": 9.359023517382414e-06, + "objective/entropy": 28.4576416015625, + "objective/kl": 47.77678680419922, + "objective/non_score_reward": -2.3888392448425293, + "objective/rlhf_reward": -9.555356621742249, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.657729148864746, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.744140625, + "step": 1003, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000205039978027 + }, + { + "episode": 16080, + "epoch": 0.09634396232519682, + "loss/policy_avg": 0.20492899417877197, + "lr": 9.358384458077711e-06, + "objective/entropy": -11.457590103149414, + "objective/kl": 31.54743766784668, + "objective/non_score_reward": -1.5773718357086182, + "objective/rlhf_reward": -6.309487462043762, + "objective/scores": 0.0, + "policy/approxkl_avg": 14.368186950683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.53125, + "step": 1004, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9959056377410889 + }, + { + "episode": 16096, + "epoch": 0.09643982696432638, + "loss/policy_avg": 1.767996907234192, + "lr": 9.357745398773006e-06, + "objective/entropy": -49.671348571777344, + "objective/kl": 29.435527801513672, + "objective/non_score_reward": -1.4717764854431152, + "objective/rlhf_reward": -1.4871057033538815, + "objective/scores": 1.1, + "policy/approxkl_avg": 32.49064636230469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619140625, + "step": 1005, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985777139663696 + }, + { + "episode": 16112, + "epoch": 0.09653569160345592, + "loss/policy_avg": 0.26653915643692017, + "lr": 9.357106339468303e-06, + "objective/entropy": -37.01216125488281, + "objective/kl": 47.01079559326172, + "objective/non_score_reward": -2.3505396842956543, + "objective/rlhf_reward": -11.402158737182617, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.7240042686462402, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.724609375, + "step": 1006, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9991438388824463 + }, + { + "episode": 16128, + "epoch": 0.09663155624258547, + "loss/policy_avg": 0.542022168636322, + "lr": 9.3564672801636e-06, + "objective/entropy": -220.16644287109375, + "objective/kl": 40.93455505371094, + "objective/non_score_reward": -2.0467278957366943, + "objective/rlhf_reward": -10.186911582946777, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.69597625732422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.666015625, + "step": 1007, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.996063232421875 + }, + { + "episode": 16144, + "epoch": 0.09672742088171501, + "loss/policy_avg": 0.5151812434196472, + "lr": 9.355828220858897e-06, + "objective/entropy": -98.67259216308594, + "objective/kl": 42.02862548828125, + "objective/non_score_reward": -2.101431369781494, + "objective/rlhf_reward": -10.405725479125977, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.78181838989258, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 1008, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996504783630371 + }, + { + "episode": 16160, + "epoch": 0.09682328552084457, + "loss/policy_avg": 0.12348058819770813, + "lr": 9.355189161554194e-06, + "objective/entropy": -156.9423828125, + "objective/kl": 39.4995002746582, + "objective/non_score_reward": -1.9749749898910522, + "objective/rlhf_reward": -6.238040452421295, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 38.42127227783203, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.73046875, + "step": 1009, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0012216567993164 + }, + { + "episode": 16176, + "epoch": 0.09691915015997411, + "loss/policy_avg": 0.11895422637462616, + "lr": 9.35455010224949e-06, + "objective/entropy": -30.613723754882812, + "objective/kl": 25.980791091918945, + "objective/non_score_reward": -1.299039602279663, + "objective/rlhf_reward": -5.196158349514008, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.429828643798828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.564453125, + "step": 1010, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9955189228057861 + }, + { + "episode": 16192, + "epoch": 0.09701501479910367, + "loss/policy_avg": 0.5678446888923645, + "lr": 9.353911042944786e-06, + "objective/entropy": -83.34064483642578, + "objective/kl": 42.09562683105469, + "objective/non_score_reward": -2.10478138923645, + "objective/rlhf_reward": -6.296419443861518, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 72.57111358642578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 1011, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999969244003296 + }, + { + "episode": 16208, + "epoch": 0.09711087943823321, + "loss/policy_avg": 0.6819720268249512, + "lr": 9.353271983640083e-06, + "objective/entropy": -169.74818420410156, + "objective/kl": 30.492748260498047, + "objective/non_score_reward": -1.5246374607086182, + "objective/rlhf_reward": -8.098549842834473, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.028923988342285, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.677734375, + "step": 1012, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993962049484253 + }, + { + "episode": 16224, + "epoch": 0.09720674407736277, + "loss/policy_avg": 0.007742304354906082, + "lr": 9.352632924335378e-06, + "objective/entropy": -207.97596740722656, + "objective/kl": 37.429107666015625, + "objective/non_score_reward": -1.871455430984497, + "objective/rlhf_reward": -6.160308990508241, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 54.85210037231445, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.619140625, + "step": 1013, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997692108154297 + }, + { + "episode": 16240, + "epoch": 0.09730260871649231, + "loss/policy_avg": 0.33564040064811707, + "lr": 9.351993865030675e-06, + "objective/entropy": -200.87437438964844, + "objective/kl": 32.79878616333008, + "objective/non_score_reward": -1.6399391889572144, + "objective/rlhf_reward": -5.200507008765621, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 92.10453796386719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.701171875, + "step": 1014, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9954192638397217 + }, + { + "episode": 16256, + "epoch": 0.09739847335562186, + "loss/policy_avg": 0.537328839302063, + "lr": 9.351354805725972e-06, + "objective/entropy": -205.0013427734375, + "objective/kl": 44.42485809326172, + "objective/non_score_reward": -2.221242904663086, + "objective/rlhf_reward": -8.884971499443054, + "objective/scores": 0.0, + "policy/approxkl_avg": 12.897591590881348, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.708984375, + "step": 1015, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989149570465088 + }, + { + "episode": 16272, + "epoch": 0.0974943379947514, + "loss/policy_avg": 1.2686612606048584, + "lr": 9.350715746421269e-06, + "objective/entropy": -87.46932983398438, + "objective/kl": 48.91461181640625, + "objective/non_score_reward": -2.445730686187744, + "objective/rlhf_reward": -8.404320695487362, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 143.83013916015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.552734375, + "step": 1016, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974641799926758 + }, + { + "episode": 16288, + "epoch": 0.09759020263388096, + "loss/policy_avg": 0.047659993171691895, + "lr": 9.350076687116566e-06, + "objective/entropy": -119.82821655273438, + "objective/kl": 42.86351776123047, + "objective/non_score_reward": -2.1431756019592285, + "objective/rlhf_reward": -10.572702407836914, + "objective/scores": -0.5, + "policy/approxkl_avg": 45.73783874511719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73046875, + "step": 1017, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993259906768799 + }, + { + "episode": 16304, + "epoch": 0.0976860672730105, + "loss/policy_avg": 0.13739565014839172, + "lr": 9.34943762781186e-06, + "objective/entropy": -191.99688720703125, + "objective/kl": 25.849380493164062, + "objective/non_score_reward": -1.2924690246582031, + "objective/rlhf_reward": -3.8282405047709043, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 33.075462341308594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.546875, + "step": 1018, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983313083648682 + }, + { + "episode": 16320, + "epoch": 0.09778193191214006, + "loss/policy_avg": 0.2921428680419922, + "lr": 9.348798568507158e-06, + "objective/entropy": 22.923290252685547, + "objective/kl": 40.48946762084961, + "objective/non_score_reward": -2.0244734287261963, + "objective/rlhf_reward": -8.097893476486206, + "objective/scores": 0.0, + "policy/approxkl_avg": 45.973087310791016, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.814453125, + "step": 1019, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997511386871338 + }, + { + "episode": 16336, + "epoch": 0.0978777965512696, + "loss/policy_avg": 0.49925512075424194, + "lr": 9.348159509202455e-06, + "objective/entropy": -182.9492950439453, + "objective/kl": 38.02376937866211, + "objective/non_score_reward": -1.9011883735656738, + "objective/rlhf_reward": -9.604753494262695, + "objective/scores": -0.5, + "policy/approxkl_avg": 93.45957946777344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75, + "step": 1020, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0008862018585205 + }, + { + "episode": 16352, + "epoch": 0.09797366119039916, + "loss/policy_avg": 0.39709025621414185, + "lr": 9.347520449897751e-06, + "objective/entropy": -71.1256332397461, + "objective/kl": 51.33026123046875, + "objective/non_score_reward": -2.5665130615234375, + "objective/rlhf_reward": -10.266052007675171, + "objective/scores": 0.0, + "policy/approxkl_avg": 94.9317626953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6171875, + "step": 1021, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.998081922531128 + }, + { + "episode": 16368, + "epoch": 0.0980695258295287, + "loss/policy_avg": 0.46625053882598877, + "lr": 9.346881390593048e-06, + "objective/entropy": -37.20600891113281, + "objective/kl": 31.9682674407959, + "objective/non_score_reward": -1.598413348197937, + "objective/rlhf_reward": -4.660320059458414, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 43.983985900878906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66015625, + "step": 1022, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999427318572998 + }, + { + "episode": 16384, + "epoch": 0.09816539046865826, + "loss/policy_avg": 0.8187178373336792, + "lr": 9.346242331288345e-06, + "objective/entropy": -40.951171875, + "objective/kl": 42.537940979003906, + "objective/non_score_reward": -2.126896858215332, + "objective/rlhf_reward": -6.90346816546114, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 7.384176254272461, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.568359375, + "step": 1023, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980034828186035 + }, + { + "episode": 16400, + "epoch": 0.0982612551077878, + "loss/policy_avg": -0.34478098154067993, + "lr": 9.34560327198364e-06, + "objective/entropy": -103.81099700927734, + "objective/kl": 46.633453369140625, + "objective/non_score_reward": -2.3316726684570312, + "objective/rlhf_reward": -11.326690673828125, + "objective/scores": -0.5, + "policy/approxkl_avg": 73.44123840332031, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.767578125, + "step": 1024, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0072782039642334 + }, + { + "episode": 16416, + "epoch": 0.09835711974691735, + "loss/policy_avg": -0.0066276490688323975, + "lr": 9.344964212678937e-06, + "objective/entropy": -134.967041015625, + "objective/kl": 39.100215911865234, + "objective/non_score_reward": -1.9550105333328247, + "objective/rlhf_reward": -7.8200424909591675, + "objective/scores": 0.0, + "policy/approxkl_avg": 47.77195358276367, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6328125, + "step": 1025, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981870651245117 + }, + { + "episode": 16432, + "epoch": 0.0984529843860469, + "loss/policy_avg": 1.3834272623062134, + "lr": 9.344325153374234e-06, + "objective/entropy": -135.8179473876953, + "objective/kl": 41.999996185302734, + "objective/non_score_reward": -2.0999999046325684, + "objective/rlhf_reward": -6.666666285196939, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 39.04522705078125, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.529296875, + "step": 1026, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999448537826538 + }, + { + "episode": 16448, + "epoch": 0.09854884902517645, + "loss/policy_avg": 0.24614977836608887, + "lr": 9.343686094069531e-06, + "objective/entropy": -107.9916000366211, + "objective/kl": 27.381237030029297, + "objective/non_score_reward": -1.3690619468688965, + "objective/rlhf_reward": -1.076247429847717, + "objective/scores": 1.1, + "policy/approxkl_avg": 26.950410842895508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619140625, + "step": 1027, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998441457748413 + }, + { + "episode": 16464, + "epoch": 0.098644713664306, + "loss/policy_avg": 9.17509651184082, + "lr": 9.343047034764828e-06, + "objective/entropy": -128.18038940429688, + "objective/kl": 46.282501220703125, + "objective/non_score_reward": -2.3141250610351562, + "objective/rlhf_reward": -7.914864709883361, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 8.609909057617188, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.69921875, + "step": 1028, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986345767974854 + }, + { + "episode": 16480, + "epoch": 0.09874057830343555, + "loss/policy_avg": -0.20138072967529297, + "lr": 9.342407975460123e-06, + "objective/entropy": -157.4423370361328, + "objective/kl": 52.01014709472656, + "objective/non_score_reward": -2.6005072593688965, + "objective/rlhf_reward": -10.402029156684875, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.244065284729004, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.662109375, + "step": 1029, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999929666519165 + }, + { + "episode": 16496, + "epoch": 0.09883644294256509, + "loss/policy_avg": 0.2195969671010971, + "lr": 9.34176891615542e-06, + "objective/entropy": -189.69271850585938, + "objective/kl": 36.75048065185547, + "objective/non_score_reward": -1.8375239372253418, + "objective/rlhf_reward": -5.834324085506138, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 82.3933334350586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.556640625, + "step": 1030, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998250961303711 + }, + { + "episode": 16512, + "epoch": 0.09893230758169465, + "loss/policy_avg": 0.06882472336292267, + "lr": 9.341129856850717e-06, + "objective/entropy": -217.81845092773438, + "objective/kl": 41.54722595214844, + "objective/non_score_reward": -2.0773613452911377, + "objective/rlhf_reward": -6.362034092621739, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.015896797180176, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.556640625, + "step": 1031, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998700499534607 + }, + { + "episode": 16528, + "epoch": 0.09902817222082419, + "loss/policy_avg": 0.440207839012146, + "lr": 9.340490797546014e-06, + "objective/entropy": -58.1538200378418, + "objective/kl": 42.825157165527344, + "objective/non_score_reward": -2.1412577629089355, + "objective/rlhf_reward": -8.565030932426453, + "objective/scores": 0.0, + "policy/approxkl_avg": 36.369468688964844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.89453125, + "step": 1032, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995689392089844 + }, + { + "episode": 16544, + "epoch": 0.09912403685995375, + "loss/policy_avg": 0.21779778599739075, + "lr": 9.33985173824131e-06, + "objective/entropy": -82.06159973144531, + "objective/kl": 31.33786392211914, + "objective/non_score_reward": -1.5668931007385254, + "objective/rlhf_reward": -3.86757276058197, + "objective/scores": 0.6, + "policy/approxkl_avg": 24.193450927734375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.755859375, + "step": 1033, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99662184715271 + }, + { + "episode": 16560, + "epoch": 0.09921990149908329, + "loss/policy_avg": 0.2209034264087677, + "lr": 9.339212678936606e-06, + "objective/entropy": -143.8715057373047, + "objective/kl": 28.651321411132812, + "objective/non_score_reward": -1.4325661659240723, + "objective/rlhf_reward": -5.7302645444869995, + "objective/scores": 0.0, + "policy/approxkl_avg": 20.247289657592773, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.662109375, + "step": 1034, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983750581741333 + }, + { + "episode": 16576, + "epoch": 0.09931576613821284, + "loss/policy_avg": 0.6013627052307129, + "lr": 9.338573619631903e-06, + "objective/entropy": -127.35606384277344, + "objective/kl": 42.513511657714844, + "objective/non_score_reward": -2.125675678253174, + "objective/rlhf_reward": -10.502701759338379, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.316253662109375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6015625, + "step": 1035, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9965988397598267 + }, + { + "episode": 16592, + "epoch": 0.09941163077734239, + "loss/policy_avg": 0.025708666071295738, + "lr": 9.3379345603272e-06, + "objective/entropy": -131.59385681152344, + "objective/kl": 43.27368927001953, + "objective/non_score_reward": -2.163684606552124, + "objective/rlhf_reward": -7.138966524394688, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 97.55415344238281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.50390625, + "step": 1036, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987868070602417 + }, + { + "episode": 16608, + "epoch": 0.09950749541647194, + "loss/policy_avg": 0.11126343160867691, + "lr": 9.337295501022495e-06, + "objective/entropy": -214.27691650390625, + "objective/kl": 32.323978424072266, + "objective/non_score_reward": -1.616199016571045, + "objective/rlhf_reward": -6.46479606628418, + "objective/scores": 0.0, + "policy/approxkl_avg": 79.82876586914062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1037, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.999455451965332 + }, + { + "episode": 16624, + "epoch": 0.0996033600556015, + "loss/policy_avg": 1.7602635622024536, + "lr": 9.336656441717792e-06, + "objective/entropy": -178.519775390625, + "objective/kl": 34.503726959228516, + "objective/non_score_reward": -1.7251863479614258, + "objective/rlhf_reward": -5.419792893345713, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 3.1007509231567383, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.63671875, + "step": 1038, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0009100437164307 + }, + { + "episode": 16640, + "epoch": 0.09969922469473104, + "loss/policy_avg": -0.03113560751080513, + "lr": 9.336017382413088e-06, + "objective/entropy": -117.89727783203125, + "objective/kl": 36.79457092285156, + "objective/non_score_reward": -1.839728832244873, + "objective/rlhf_reward": -7.358915090560913, + "objective/scores": 0.0, + "policy/approxkl_avg": 38.075477600097656, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.560546875, + "step": 1039, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001939296722412 + }, + { + "episode": 16656, + "epoch": 0.0997950893338606, + "loss/policy_avg": 0.5486714839935303, + "lr": 9.335378323108385e-06, + "objective/entropy": -87.73934936523438, + "objective/kl": 35.572235107421875, + "objective/non_score_reward": -1.7786118984222412, + "objective/rlhf_reward": -5.788934741049928, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 14.469789505004883, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.505859375, + "step": 1040, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.996793508529663 + }, + { + "episode": 16672, + "epoch": 0.09989095397299014, + "loss/policy_avg": 1.5353370904922485, + "lr": 9.334739263803682e-06, + "objective/entropy": -231.09872436523438, + "objective/kl": 41.20832061767578, + "objective/non_score_reward": -2.0604159832000732, + "objective/rlhf_reward": -6.863061764327389, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 12.170169830322266, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.611328125, + "step": 1041, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989726543426514 + }, + { + "episode": 16688, + "epoch": 0.0999868186121197, + "loss/policy_avg": 0.567713737487793, + "lr": 9.334100204498977e-06, + "objective/entropy": -241.43797302246094, + "objective/kl": 33.448394775390625, + "objective/non_score_reward": -1.6724196672439575, + "objective/rlhf_reward": -6.689678728580475, + "objective/scores": 0.0, + "policy/approxkl_avg": 21.26333999633789, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6875, + "step": 1042, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985414743423462 + }, + { + "episode": 16704, + "epoch": 0.10008268325124924, + "loss/policy_avg": 3.0933616161346436, + "lr": 9.333461145194274e-06, + "objective/entropy": -97.33987426757812, + "objective/kl": 43.729984283447266, + "objective/non_score_reward": -2.1864991188049316, + "objective/rlhf_reward": -10.745996475219727, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.958702087402344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73046875, + "step": 1043, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002870559692383 + }, + { + "episode": 16720, + "epoch": 0.10017854789037879, + "loss/policy_avg": 0.1494733989238739, + "lr": 9.332822085889571e-06, + "objective/entropy": -178.5977783203125, + "objective/kl": 34.75993728637695, + "objective/non_score_reward": -1.7379969358444214, + "objective/rlhf_reward": -8.951988220214844, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.86305236816406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.759765625, + "step": 1044, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000399351119995 + }, + { + "episode": 16736, + "epoch": 0.10027441252950833, + "loss/policy_avg": 0.31734079122543335, + "lr": 9.332183026584868e-06, + "objective/entropy": -128.50457763671875, + "objective/kl": 40.146453857421875, + "objective/non_score_reward": -2.0073227882385254, + "objective/rlhf_reward": -5.62929151058197, + "objective/scores": 0.6, + "policy/approxkl_avg": 11.523036003112793, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.712890625, + "step": 1045, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990044832229614 + }, + { + "episode": 16752, + "epoch": 0.10037027716863789, + "loss/policy_avg": 0.6753818392753601, + "lr": 9.331543967280165e-06, + "objective/entropy": -21.29187774658203, + "objective/kl": 36.3908576965332, + "objective/non_score_reward": -1.819542646408081, + "objective/rlhf_reward": -5.674050811592656, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 84.15306091308594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.736328125, + "step": 1046, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979207515716553 + }, + { + "episode": 16768, + "epoch": 0.10046614180776743, + "loss/policy_avg": 1.1861382722854614, + "lr": 9.330904907975462e-06, + "objective/entropy": -138.18101501464844, + "objective/kl": 44.877899169921875, + "objective/non_score_reward": -2.2438952922821045, + "objective/rlhf_reward": -6.051861916424009, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 3.5224761962890625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62890625, + "step": 1047, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0006613731384277 + }, + { + "episode": 16784, + "epoch": 0.10056200644689699, + "loss/policy_avg": 0.03651944920420647, + "lr": 9.330265848670757e-06, + "objective/entropy": -175.96896362304688, + "objective/kl": 31.624242782592773, + "objective/non_score_reward": -1.581212043762207, + "objective/rlhf_reward": -3.401129518390867, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 26.37567901611328, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.681640625, + "step": 1048, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9982459545135498 + }, + { + "episode": 16800, + "epoch": 0.10065787108602653, + "loss/policy_avg": 0.308625191450119, + "lr": 9.329626789366054e-06, + "objective/entropy": -192.007568359375, + "objective/kl": 32.40314865112305, + "objective/non_score_reward": -1.6201574802398682, + "objective/rlhf_reward": -4.533218692021306, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 66.30451965332031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.548828125, + "step": 1049, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988420009613037 + }, + { + "episode": 16816, + "epoch": 0.10075373572515609, + "loss/policy_avg": 0.09003337472677231, + "lr": 9.32898773006135e-06, + "objective/entropy": -196.39547729492188, + "objective/kl": 37.95484161376953, + "objective/non_score_reward": -1.8977420330047607, + "objective/rlhf_reward": -7.590968012809753, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.447707176208496, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.685546875, + "step": 1050, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001563787460327 + }, + { + "episode": 16832, + "epoch": 0.10084960036428563, + "loss/policy_avg": -0.000698484480381012, + "lr": 9.328348670756648e-06, + "objective/entropy": -210.3636474609375, + "objective/kl": 35.0748291015625, + "objective/non_score_reward": -1.7537415027618408, + "objective/rlhf_reward": -7.014966368675232, + "objective/scores": 0.0, + "policy/approxkl_avg": 38.02912139892578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.623046875, + "step": 1051, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9968023300170898 + }, + { + "episode": 16848, + "epoch": 0.10094546500341518, + "loss/policy_avg": 0.722516655921936, + "lr": 9.327709611451944e-06, + "objective/entropy": -217.8343505859375, + "objective/kl": 32.70154571533203, + "objective/non_score_reward": -1.6350772380828857, + "objective/rlhf_reward": -6.540309190750122, + "objective/scores": 0.0, + "policy/approxkl_avg": 23.104328155517578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.75, + "step": 1052, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001298666000366 + }, + { + "episode": 16864, + "epoch": 0.10104132964254472, + "loss/policy_avg": 0.06484949588775635, + "lr": 9.32707055214724e-06, + "objective/entropy": -160.59938049316406, + "objective/kl": 30.30784034729004, + "objective/non_score_reward": -1.5153919458389282, + "objective/rlhf_reward": -4.610969643206939, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 26.352643966674805, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.734375, + "step": 1053, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995694160461426 + }, + { + "episode": 16880, + "epoch": 0.10113719428167428, + "loss/policy_avg": 0.38344231247901917, + "lr": 9.326431492842537e-06, + "objective/entropy": -121.33949279785156, + "objective/kl": 48.2208251953125, + "objective/non_score_reward": -2.411041498184204, + "objective/rlhf_reward": -9.644165873527527, + "objective/scores": 0.0, + "policy/approxkl_avg": 32.27105712890625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.724609375, + "step": 1054, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0021653175354004 + }, + { + "episode": 16896, + "epoch": 0.10123305892080382, + "loss/policy_avg": 0.1565134972333908, + "lr": 9.325792433537833e-06, + "objective/entropy": -244.3416290283203, + "objective/kl": 30.178707122802734, + "objective/non_score_reward": -1.5089352130889893, + "objective/rlhf_reward": -6.035740971565247, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.869574546813965, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4970703125, + "step": 1055, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987046718597412 + }, + { + "episode": 16912, + "epoch": 0.10132892355993338, + "loss/policy_avg": 0.5662046670913696, + "lr": 9.325153374233129e-06, + "objective/entropy": -40.44217300415039, + "objective/kl": 41.73125457763672, + "objective/non_score_reward": -2.0865628719329834, + "objective/rlhf_reward": -10.346250534057617, + "objective/scores": -0.5, + "policy/approxkl_avg": 66.33694458007812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6015625, + "step": 1056, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9967844486236572 + }, + { + "episode": 16928, + "epoch": 0.10142478819906292, + "loss/policy_avg": 0.17891189455986023, + "lr": 9.324514314928425e-06, + "objective/entropy": -195.00128173828125, + "objective/kl": 31.191688537597656, + "objective/non_score_reward": -1.5595844984054565, + "objective/rlhf_reward": -8.238338470458984, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.837324142456055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.724609375, + "step": 1057, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9974877834320068 + }, + { + "episode": 16944, + "epoch": 0.10152065283819248, + "loss/policy_avg": 0.2507179379463196, + "lr": 9.323875255623722e-06, + "objective/entropy": -244.8971710205078, + "objective/kl": 30.04808235168457, + "objective/non_score_reward": -1.5024040937423706, + "objective/rlhf_reward": -8.00961685180664, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.368750810623169, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59765625, + "step": 1058, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989590644836426 + }, + { + "episode": 16960, + "epoch": 0.10161651747732202, + "loss/policy_avg": 0.31457293033599854, + "lr": 9.32323619631902e-06, + "objective/entropy": -103.91079711914062, + "objective/kl": 34.75779724121094, + "objective/non_score_reward": -1.7378900051116943, + "objective/rlhf_reward": -6.951560378074646, + "objective/scores": 0.0, + "policy/approxkl_avg": 30.09579086303711, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.630859375, + "step": 1059, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998429298400879 + }, + { + "episode": 16976, + "epoch": 0.10171238211645157, + "loss/policy_avg": 0.21612216532230377, + "lr": 9.322597137014316e-06, + "objective/entropy": -167.63034057617188, + "objective/kl": 29.790748596191406, + "objective/non_score_reward": -1.4895374774932861, + "objective/rlhf_reward": -1.5581499099731442, + "objective/scores": 1.1, + "policy/approxkl_avg": 38.86473083496094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 1060, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998309850692749 + }, + { + "episode": 16992, + "epoch": 0.10180824675558112, + "loss/policy_avg": 0.3785962462425232, + "lr": 9.321958077709611e-06, + "objective/entropy": -54.48414611816406, + "objective/kl": 32.687042236328125, + "objective/non_score_reward": -1.6343519687652588, + "objective/rlhf_reward": -8.537407875061035, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.902872085571289, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8125, + "step": 1061, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995067119598389 + }, + { + "episode": 17008, + "epoch": 0.10190411139471067, + "loss/policy_avg": 0.48738372325897217, + "lr": 9.321319018404908e-06, + "objective/entropy": -231.13339233398438, + "objective/kl": 36.09325408935547, + "objective/non_score_reward": -1.804662823677063, + "objective/rlhf_reward": -2.818651413917541, + "objective/scores": 1.1, + "policy/approxkl_avg": 22.55707359313965, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.54296875, + "step": 1062, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000725030899048 + }, + { + "episode": 17024, + "epoch": 0.10199997603384021, + "loss/policy_avg": 0.30610886216163635, + "lr": 9.320679959100205e-06, + "objective/entropy": -261.2393493652344, + "objective/kl": 37.69483184814453, + "objective/non_score_reward": -1.8847413063049316, + "objective/rlhf_reward": -7.538965463638306, + "objective/scores": 0.0, + "policy/approxkl_avg": 98.26731872558594, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.619140625, + "step": 1063, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9972697496414185 + }, + { + "episode": 17040, + "epoch": 0.10209584067296977, + "loss/policy_avg": 0.40209701657295227, + "lr": 9.320040899795502e-06, + "objective/entropy": 19.392539978027344, + "objective/kl": 29.5678653717041, + "objective/non_score_reward": -1.478393316268921, + "objective/rlhf_reward": -5.913573384284973, + "objective/scores": 0.0, + "policy/approxkl_avg": 57.251625061035156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.798828125, + "step": 1064, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998482584953308 + }, + { + "episode": 17056, + "epoch": 0.10219170531209931, + "loss/policy_avg": 0.41092318296432495, + "lr": 9.319401840490799e-06, + "objective/entropy": -85.32174682617188, + "objective/kl": 45.41130828857422, + "objective/non_score_reward": -2.2705655097961426, + "objective/rlhf_reward": -7.257433290752481, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 101.7055435180664, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6796875, + "step": 1065, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9998722076416016 + }, + { + "episode": 17072, + "epoch": 0.10228756995122887, + "loss/policy_avg": 0.5468560457229614, + "lr": 9.318762781186094e-06, + "objective/entropy": -115.47386169433594, + "objective/kl": 27.086318969726562, + "objective/non_score_reward": -1.354315996170044, + "objective/rlhf_reward": -4.075628271608978, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 10.811802864074707, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5234375, + "step": 1066, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.003807783126831 + }, + { + "episode": 17088, + "epoch": 0.10238343459035841, + "loss/policy_avg": 0.5827947854995728, + "lr": 9.318123721881391e-06, + "objective/entropy": -106.19446563720703, + "objective/kl": 49.741939544677734, + "objective/non_score_reward": -2.4870970249176025, + "objective/rlhf_reward": -9.948387861251831, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.485939979553223, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8046875, + "step": 1067, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9999773502349854 + }, + { + "episode": 17104, + "epoch": 0.10247929922948797, + "loss/policy_avg": -0.02426096983253956, + "lr": 9.317484662576688e-06, + "objective/entropy": -111.4180908203125, + "objective/kl": 43.29094696044922, + "objective/non_score_reward": -2.1645474433898926, + "objective/rlhf_reward": -10.65818977355957, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.30387306213379, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.78515625, + "step": 1068, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997377395629883 + }, + { + "episode": 17120, + "epoch": 0.10257516386861751, + "loss/policy_avg": 0.2527628242969513, + "lr": 9.316845603271985e-06, + "objective/entropy": 12.093185424804688, + "objective/kl": 60.302955627441406, + "objective/non_score_reward": -3.0151476860046387, + "objective/rlhf_reward": -12.060590744018555, + "objective/scores": 0.0, + "policy/approxkl_avg": 77.34808349609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6171875, + "step": 1069, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994676113128662 + }, + { + "episode": 17136, + "epoch": 0.10267102850774706, + "loss/policy_avg": 0.24966003000736237, + "lr": 9.316206543967282e-06, + "objective/entropy": -184.75172424316406, + "objective/kl": 44.51036071777344, + "objective/non_score_reward": -2.225517988204956, + "objective/rlhf_reward": -10.902071952819824, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.141345739364624, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 1070, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0024020671844482 + }, + { + "episode": 17152, + "epoch": 0.1027668931468766, + "loss/policy_avg": 0.20333248376846313, + "lr": 9.315567484662578e-06, + "objective/entropy": -243.2203369140625, + "objective/kl": 44.50590515136719, + "objective/non_score_reward": -2.225295066833496, + "objective/rlhf_reward": -10.901180267333984, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.091179370880127, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69921875, + "step": 1071, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984550476074219 + }, + { + "episode": 17168, + "epoch": 0.10286275778600616, + "loss/policy_avg": 0.024163365364074707, + "lr": 9.314928425357874e-06, + "objective/entropy": -234.9069366455078, + "objective/kl": 32.5240478515625, + "objective/non_score_reward": -1.6262023448944092, + "objective/rlhf_reward": -8.504809379577637, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.942357063293457, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.560546875, + "step": 1072, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001004219055176 + }, + { + "episode": 17184, + "epoch": 0.1029586224251357, + "loss/policy_avg": 0.12604910135269165, + "lr": 9.31428936605317e-06, + "objective/entropy": -230.594482421875, + "objective/kl": 24.642736434936523, + "objective/non_score_reward": -1.2321367263793945, + "objective/rlhf_reward": -3.266687756002532, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 59.71365737915039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 1073, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.994974136352539 + }, + { + "episode": 17200, + "epoch": 0.10305448706426526, + "loss/policy_avg": 0.28256043791770935, + "lr": 9.313650306748467e-06, + "objective/entropy": -240.85040283203125, + "objective/kl": 41.427154541015625, + "objective/non_score_reward": -2.0713577270507812, + "objective/rlhf_reward": -8.285430908203125, + "objective/scores": 0.0, + "policy/approxkl_avg": 44.274871826171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.701171875, + "step": 1074, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997580647468567 + }, + { + "episode": 17216, + "epoch": 0.1031503517033948, + "loss/policy_avg": -0.2876598834991455, + "lr": 9.313011247443764e-06, + "objective/entropy": -127.2560806274414, + "objective/kl": 43.468505859375, + "objective/non_score_reward": -2.1734251976013184, + "objective/rlhf_reward": -8.693701386451721, + "objective/scores": 0.0, + "policy/approxkl_avg": 98.98490905761719, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6953125, + "step": 1075, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0023903846740723 + }, + { + "episode": 17232, + "epoch": 0.10324621634252436, + "loss/policy_avg": -0.13751818239688873, + "lr": 9.312372188139061e-06, + "objective/entropy": -214.5344696044922, + "objective/kl": 25.465530395507812, + "objective/non_score_reward": -1.2732765674591064, + "objective/rlhf_reward": -5.093105912208557, + "objective/scores": 0.0, + "policy/approxkl_avg": 63.27098846435547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.818359375, + "step": 1076, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0017144680023193 + }, + { + "episode": 17248, + "epoch": 0.1033420809816539, + "loss/policy_avg": 0.08209620416164398, + "lr": 9.311733128834356e-06, + "objective/entropy": -83.16780853271484, + "objective/kl": 36.76767349243164, + "objective/non_score_reward": -1.838383674621582, + "objective/rlhf_reward": -9.353534698486328, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.638032913208008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71484375, + "step": 1077, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9992196559906006 + }, + { + "episode": 17264, + "epoch": 0.10343794562078346, + "loss/policy_avg": 0.905781626701355, + "lr": 9.311094069529653e-06, + "objective/entropy": -64.38595581054688, + "objective/kl": 39.2518310546875, + "objective/non_score_reward": -1.9625916481018066, + "objective/rlhf_reward": -9.850366592407227, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.072268486022949, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 1078, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9996236562728882 + }, + { + "episode": 17280, + "epoch": 0.103533810259913, + "loss/policy_avg": 0.20488634705543518, + "lr": 9.310455010224948e-06, + "objective/entropy": -197.39642333984375, + "objective/kl": 36.854251861572266, + "objective/non_score_reward": -1.8427127599716187, + "objective/rlhf_reward": -7.370850682258606, + "objective/scores": 0.0, + "policy/approxkl_avg": 200.13021850585938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736328125, + "step": 1079, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001244068145752 + }, + { + "episode": 17296, + "epoch": 0.10362967489904255, + "loss/policy_avg": 0.5982025861740112, + "lr": 9.309815950920245e-06, + "objective/entropy": -195.03408813476562, + "objective/kl": 48.12640380859375, + "objective/non_score_reward": -2.406320095062256, + "objective/rlhf_reward": -9.625280857086182, + "objective/scores": 0.0, + "policy/approxkl_avg": 95.50926208496094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.599609375, + "step": 1080, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9973835945129395 + }, + { + "episode": 17312, + "epoch": 0.1037255395381721, + "loss/policy_avg": 0.16170604526996613, + "lr": 9.309176891615542e-06, + "objective/entropy": -209.88661193847656, + "objective/kl": 35.2683219909668, + "objective/non_score_reward": -1.763416051864624, + "objective/rlhf_reward": -7.053664565086365, + "objective/scores": 0.0, + "policy/approxkl_avg": 26.139625549316406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.671875, + "step": 1081, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996190071105957 + }, + { + "episode": 17328, + "epoch": 0.10382140417730165, + "loss/policy_avg": 1.1378875970840454, + "lr": 9.308537832310839e-06, + "objective/entropy": -221.8079376220703, + "objective/kl": 47.17389678955078, + "objective/non_score_reward": -2.3586950302124023, + "objective/rlhf_reward": -8.056177713958126, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 4.690964698791504, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.677734375, + "step": 1082, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.998779058456421 + }, + { + "episode": 17344, + "epoch": 0.1039172688164312, + "loss/policy_avg": 0.3751685917377472, + "lr": 9.307898773006136e-06, + "objective/entropy": -87.24127197265625, + "objective/kl": 32.1821403503418, + "objective/non_score_reward": -1.6091070175170898, + "objective/rlhf_reward": -6.436428189277649, + "objective/scores": 0.0, + "policy/approxkl_avg": 19.975833892822266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 1083, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9977320432662964 + }, + { + "episode": 17360, + "epoch": 0.10401313345556075, + "loss/policy_avg": 0.10768476128578186, + "lr": 9.307259713701433e-06, + "objective/entropy": -202.04653930664062, + "objective/kl": 41.12028503417969, + "objective/non_score_reward": -2.0560145378112793, + "objective/rlhf_reward": -8.224057674407959, + "objective/scores": 0.0, + "policy/approxkl_avg": 81.79635620117188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.587890625, + "step": 1084, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987776279449463 + }, + { + "episode": 17376, + "epoch": 0.10410899809469029, + "loss/policy_avg": -0.05176592990756035, + "lr": 9.306620654396728e-06, + "objective/entropy": -208.77484130859375, + "objective/kl": 38.498958587646484, + "objective/non_score_reward": -1.92494797706604, + "objective/rlhf_reward": -9.699792861938477, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.74083709716797, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.693359375, + "step": 1085, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.006058692932129 + }, + { + "episode": 17392, + "epoch": 0.10420486273381985, + "loss/policy_avg": 0.28333908319473267, + "lr": 9.305981595092025e-06, + "objective/entropy": -102.09290313720703, + "objective/kl": 50.06310272216797, + "objective/non_score_reward": -2.50315523147583, + "objective/rlhf_reward": -12.01262092590332, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.604026794433594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.599609375, + "step": 1086, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990627765655518 + }, + { + "episode": 17408, + "epoch": 0.10430072737294939, + "loss/policy_avg": 0.42880111932754517, + "lr": 9.305342535787322e-06, + "objective/entropy": -181.89987182617188, + "objective/kl": 40.60636520385742, + "objective/non_score_reward": -2.030318260192871, + "objective/rlhf_reward": -6.1738618118333175, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 25.25945281982422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 1087, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9990653991699219 + }, + { + "episode": 17424, + "epoch": 0.10439659201207895, + "loss/policy_avg": 0.25624793767929077, + "lr": 9.304703476482619e-06, + "objective/entropy": -110.85409545898438, + "objective/kl": 32.54735565185547, + "objective/non_score_reward": -1.6273677349090576, + "objective/rlhf_reward": -5.198789706429839, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 27.639907836914062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.439453125, + "step": 1088, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995222091674805 + }, + { + "episode": 17440, + "epoch": 0.10449245665120849, + "loss/policy_avg": -0.019633345305919647, + "lr": 9.304064417177915e-06, + "objective/entropy": -125.78160095214844, + "objective/kl": 34.34632110595703, + "objective/non_score_reward": -1.7173161506652832, + "objective/rlhf_reward": -8.869264602661133, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.5311360359191895, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.638671875, + "step": 1089, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9991655349731445 + }, + { + "episode": 17456, + "epoch": 0.10458832129033804, + "loss/policy_avg": 0.3127847909927368, + "lr": 9.30342535787321e-06, + "objective/entropy": -173.8811798095703, + "objective/kl": 26.766096115112305, + "objective/non_score_reward": -1.3383049964904785, + "objective/rlhf_reward": -2.9532195687294003, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.7351056337356567, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 1090, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989339113235474 + }, + { + "episode": 17472, + "epoch": 0.10468418592946759, + "loss/policy_avg": 0.005596889182925224, + "lr": 9.302786298568508e-06, + "objective/entropy": -126.20843505859375, + "objective/kl": 43.94227981567383, + "objective/non_score_reward": -2.1971139907836914, + "objective/rlhf_reward": -7.272683942111668, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 1.9898426532745361, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.546875, + "step": 1091, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000331401824951 + }, + { + "episode": 17488, + "epoch": 0.10478005056859714, + "loss/policy_avg": 3.194516658782959, + "lr": 9.302147239263804e-06, + "objective/entropy": -74.95545196533203, + "objective/kl": 26.713520050048828, + "objective/non_score_reward": -1.3356759548187256, + "objective/rlhf_reward": -3.517875249656748, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 12.780288696289062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 1092, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0024166107177734 + }, + { + "episode": 17504, + "epoch": 0.10487591520772668, + "loss/policy_avg": 0.12241563946008682, + "lr": 9.301508179959101e-06, + "objective/entropy": -194.66860961914062, + "objective/kl": 30.9637393951416, + "objective/non_score_reward": -1.548187017440796, + "objective/rlhf_reward": -8.192748069763184, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.120702266693115, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.701171875, + "step": 1093, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0006556510925293 + }, + { + "episode": 17520, + "epoch": 0.10497177984685624, + "loss/policy_avg": -0.11587963998317719, + "lr": 9.300869120654398e-06, + "objective/entropy": -11.867881774902344, + "objective/kl": 36.72633361816406, + "objective/non_score_reward": -1.836316704750061, + "objective/rlhf_reward": -2.9452665805816647, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.738877534866333, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.787109375, + "step": 1094, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986064434051514 + }, + { + "episode": 17536, + "epoch": 0.1050676444859858, + "loss/policy_avg": 0.6495574712753296, + "lr": 9.300230061349695e-06, + "objective/entropy": -19.585662841796875, + "objective/kl": 37.080047607421875, + "objective/non_score_reward": -1.8540027141571045, + "objective/rlhf_reward": -9.416010856628418, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.846208572387695, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.72265625, + "step": 1095, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973466396331787 + }, + { + "episode": 17552, + "epoch": 0.10516350912511534, + "loss/policy_avg": 0.046597689390182495, + "lr": 9.29959100204499e-06, + "objective/entropy": -171.55307006835938, + "objective/kl": 40.582767486572266, + "objective/non_score_reward": -2.0291385650634766, + "objective/rlhf_reward": -6.716553783416748, + "objective/scores": 0.35, + "policy/approxkl_avg": 6.786599636077881, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.759765625, + "step": 1096, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000523090362549 + }, + { + "episode": 17568, + "epoch": 0.10525937376424489, + "loss/policy_avg": 0.21877789497375488, + "lr": 9.298951942740287e-06, + "objective/entropy": -113.61373901367188, + "objective/kl": 28.416501998901367, + "objective/non_score_reward": -1.4208252429962158, + "objective/rlhf_reward": -3.7358895046281173, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 42.627174377441406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.623046875, + "step": 1097, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984509944915771 + }, + { + "episode": 17584, + "epoch": 0.10535523840337443, + "loss/policy_avg": 0.45822641253471375, + "lr": 9.298312883435584e-06, + "objective/entropy": -144.4995574951172, + "objective/kl": 39.54690933227539, + "objective/non_score_reward": -1.9773454666137695, + "objective/rlhf_reward": -6.530779817191464, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 59.44287109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.66015625, + "step": 1098, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996052622795105 + }, + { + "episode": 17600, + "epoch": 0.10545110304250399, + "loss/policy_avg": -0.24966418743133545, + "lr": 9.29767382413088e-06, + "objective/entropy": -192.11404418945312, + "objective/kl": 48.76808166503906, + "objective/non_score_reward": -2.438404083251953, + "objective/rlhf_reward": -7.928787822994302, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 3.962144374847412, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.607421875, + "step": 1099, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002513885498047 + }, + { + "episode": 17616, + "epoch": 0.10554696768163353, + "loss/policy_avg": 0.29037731885910034, + "lr": 9.297034764826178e-06, + "objective/entropy": -193.0730743408203, + "objective/kl": 36.51386260986328, + "objective/non_score_reward": -1.825693130493164, + "objective/rlhf_reward": -7.302772641181946, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.697671890258789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.708984375, + "step": 1100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004220008850098 + }, + { + "episode": 17632, + "epoch": 0.10564283232076309, + "loss/policy_avg": 0.1895323395729065, + "lr": 9.296395705521473e-06, + "objective/entropy": -192.9074249267578, + "objective/kl": 39.492767333984375, + "objective/non_score_reward": -1.9746384620666504, + "objective/rlhf_reward": -7.898553729057312, + "objective/scores": 0.0, + "policy/approxkl_avg": 28.709880828857422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.626953125, + "step": 1101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982495307922363 + }, + { + "episode": 17648, + "epoch": 0.10573869695989263, + "loss/policy_avg": 0.5917953252792358, + "lr": 9.29575664621677e-06, + "objective/entropy": -76.33648681640625, + "objective/kl": 41.15357971191406, + "objective/non_score_reward": -2.0576794147491455, + "objective/rlhf_reward": -8.230717420578003, + "objective/scores": 0.0, + "policy/approxkl_avg": 28.34891128540039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.615234375, + "step": 1102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9937398433685303 + }, + { + "episode": 17664, + "epoch": 0.10583456159902219, + "loss/policy_avg": -0.07402372360229492, + "lr": 9.295117586912065e-06, + "objective/entropy": -178.22756958007812, + "objective/kl": 34.42258834838867, + "objective/non_score_reward": -1.7211294174194336, + "objective/rlhf_reward": -8.884517669677734, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.810836911201477, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.583984375, + "step": 1103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0009865760803223 + }, + { + "episode": 17680, + "epoch": 0.10593042623815173, + "loss/policy_avg": 0.15096347033977509, + "lr": 9.294478527607362e-06, + "objective/entropy": -238.453857421875, + "objective/kl": 40.95924377441406, + "objective/non_score_reward": -2.047962188720703, + "objective/rlhf_reward": -6.45851506392161, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.69025993347168, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 1104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000203847885132 + }, + { + "episode": 17696, + "epoch": 0.10602629087728128, + "loss/policy_avg": 0.24375811219215393, + "lr": 9.293839468302659e-06, + "objective/entropy": -189.04649353027344, + "objective/kl": 32.06029510498047, + "objective/non_score_reward": -1.6030148267745972, + "objective/rlhf_reward": -8.412059783935547, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.191177368164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 1105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985370635986328 + }, + { + "episode": 17712, + "epoch": 0.10612215551641083, + "loss/policy_avg": 0.017711302265524864, + "lr": 9.293200408997956e-06, + "objective/entropy": -230.17556762695312, + "objective/kl": 42.93938446044922, + "objective/non_score_reward": -2.1469693183898926, + "objective/rlhf_reward": -8.587876915931702, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8443315029144287, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.640625, + "step": 1106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9995675086975098 + }, + { + "episode": 17728, + "epoch": 0.10621802015554038, + "loss/policy_avg": 0.2855517566204071, + "lr": 9.292561349693252e-06, + "objective/entropy": -121.51898956298828, + "objective/kl": 41.35898208618164, + "objective/non_score_reward": -2.0679492950439453, + "objective/rlhf_reward": -10.271797180175781, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.437127113342285, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.689453125, + "step": 1107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989829063415527 + }, + { + "episode": 17744, + "epoch": 0.10631388479466992, + "loss/policy_avg": 0.2132728397846222, + "lr": 9.29192229038855e-06, + "objective/entropy": -211.87677001953125, + "objective/kl": 35.54936599731445, + "objective/non_score_reward": -1.7774684429168701, + "objective/rlhf_reward": -7.109873652458191, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.873299598693848, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.544921875, + "step": 1108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995219707489014 + }, + { + "episode": 17760, + "epoch": 0.10640974943379948, + "loss/policy_avg": -0.3036063313484192, + "lr": 9.291283231083845e-06, + "objective/entropy": -264.26220703125, + "objective/kl": 32.72498321533203, + "objective/non_score_reward": -1.6362491846084595, + "objective/rlhf_reward": -4.883137231290923, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 3.613276958465576, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.763671875, + "step": 1109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.004819393157959 + }, + { + "episode": 17776, + "epoch": 0.10650561407292902, + "loss/policy_avg": 0.09787774085998535, + "lr": 9.290644171779141e-06, + "objective/entropy": -211.03070068359375, + "objective/kl": 29.673542022705078, + "objective/non_score_reward": -1.4836771488189697, + "objective/rlhf_reward": -5.934708833694458, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.046558856964111, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.59375, + "step": 1110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997022151947021 + }, + { + "episode": 17792, + "epoch": 0.10660147871205858, + "loss/policy_avg": 0.6169148087501526, + "lr": 9.290005112474438e-06, + "objective/entropy": -85.4686279296875, + "objective/kl": 43.115867614746094, + "objective/non_score_reward": -2.1557934284210205, + "objective/rlhf_reward": -10.623173713684082, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.605442047119141, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.716796875, + "step": 1111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999053955078125 + }, + { + "episode": 17808, + "epoch": 0.10669734335118812, + "loss/policy_avg": -0.2326827198266983, + "lr": 9.289366053169735e-06, + "objective/entropy": -168.61294555664062, + "objective/kl": 37.52033996582031, + "objective/non_score_reward": -1.8760169744491577, + "objective/rlhf_reward": -6.144818389151974, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 3.934551954269409, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.642578125, + "step": 1112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0045270919799805 + }, + { + "episode": 17824, + "epoch": 0.10679320799031768, + "loss/policy_avg": 0.19226613640785217, + "lr": 9.288726993865032e-06, + "objective/entropy": -223.23049926757812, + "objective/kl": 23.595703125, + "objective/non_score_reward": -1.179785132408142, + "objective/rlhf_reward": -2.319140648841858, + "objective/scores": 0.6, + "policy/approxkl_avg": 16.47966766357422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65625, + "step": 1113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9992598295211792 + }, + { + "episode": 17840, + "epoch": 0.10688907262944722, + "loss/policy_avg": 0.03539525344967842, + "lr": 9.288087934560327e-06, + "objective/entropy": -108.81582641601562, + "objective/kl": 50.534141540527344, + "objective/non_score_reward": -2.526707172393799, + "objective/rlhf_reward": -10.106828570365906, + "objective/scores": 0.0, + "policy/approxkl_avg": 23.484683990478516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.712890625, + "step": 1114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996600866317749 + }, + { + "episode": 17856, + "epoch": 0.10698493726857677, + "loss/policy_avg": -0.0688924789428711, + "lr": 9.287448875255624e-06, + "objective/entropy": -133.3516387939453, + "objective/kl": 37.65576171875, + "objective/non_score_reward": -1.8827881813049316, + "objective/rlhf_reward": -4.607433949352476, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 71.75724792480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.580078125, + "step": 1115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00146222114563 + }, + { + "episode": 17872, + "epoch": 0.10708080190770632, + "loss/policy_avg": 0.5809643268585205, + "lr": 9.286809815950921e-06, + "objective/entropy": -207.24896240234375, + "objective/kl": 29.792011260986328, + "objective/non_score_reward": -1.4896005392074585, + "objective/rlhf_reward": -4.442630493434605, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 1.0145118236541748, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 1116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.006844997406006 + }, + { + "episode": 17888, + "epoch": 0.10717666654683587, + "loss/policy_avg": 1.1412543058395386, + "lr": 9.286170756646218e-06, + "objective/entropy": -92.31332397460938, + "objective/kl": 45.69993209838867, + "objective/non_score_reward": -2.284996747970581, + "objective/rlhf_reward": -9.139986753463745, + "objective/scores": 0.0, + "policy/approxkl_avg": 11.421480178833008, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.63671875, + "step": 1117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9972898960113525 + }, + { + "episode": 17904, + "epoch": 0.10727253118596541, + "loss/policy_avg": 0.05639980733394623, + "lr": 9.285531697341515e-06, + "objective/entropy": -212.18878173828125, + "objective/kl": 33.39814376831055, + "objective/non_score_reward": -1.6699072122573853, + "objective/rlhf_reward": -4.854799862178873, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.4437122344970703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 1118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9979337453842163 + }, + { + "episode": 17920, + "epoch": 0.10736839582509497, + "loss/policy_avg": 0.41871726512908936, + "lr": 9.284892638036812e-06, + "objective/entropy": -143.79598999023438, + "objective/kl": 38.5989990234375, + "objective/non_score_reward": -1.9299499988555908, + "objective/rlhf_reward": -7.719799995422363, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.838673114776611, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.720703125, + "step": 1119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0016021728515625 + }, + { + "episode": 17936, + "epoch": 0.10746426046422451, + "loss/policy_avg": 0.024607963860034943, + "lr": 9.284253578732107e-06, + "objective/entropy": -232.4127197265625, + "objective/kl": 37.280860900878906, + "objective/non_score_reward": -1.864043116569519, + "objective/rlhf_reward": -7.456172466278076, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5535926818847656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.787109375, + "step": 1120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.01005482673645 + }, + { + "episode": 17952, + "epoch": 0.10756012510335407, + "loss/policy_avg": 0.10696236789226532, + "lr": 9.283614519427404e-06, + "objective/entropy": -240.30230712890625, + "objective/kl": 37.33672332763672, + "objective/non_score_reward": -1.866836428642273, + "objective/rlhf_reward": -7.467345595359802, + "objective/scores": 0.0, + "policy/approxkl_avg": 26.991355895996094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.716796875, + "step": 1121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9982495307922363 + }, + { + "episode": 17968, + "epoch": 0.10765598974248361, + "loss/policy_avg": 0.03670099377632141, + "lr": 9.2829754601227e-06, + "objective/entropy": -197.23898315429688, + "objective/kl": 41.97047424316406, + "objective/non_score_reward": -2.0985236167907715, + "objective/rlhf_reward": -3.994094347953796, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.509681701660156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.857421875, + "step": 1122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0017008781433105 + }, + { + "episode": 17984, + "epoch": 0.10775185438161317, + "loss/policy_avg": 0.32994019985198975, + "lr": 9.282336400817996e-06, + "objective/entropy": -188.03956604003906, + "objective/kl": 43.07110595703125, + "objective/non_score_reward": -2.153555393218994, + "objective/rlhf_reward": -4.214221572875976, + "objective/scores": 1.1, + "policy/approxkl_avg": 7.223627090454102, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.642578125, + "step": 1123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975366592407227 + }, + { + "episode": 18000, + "epoch": 0.10784771902074271, + "loss/policy_avg": 0.5365920066833496, + "lr": 9.281697341513293e-06, + "objective/entropy": -195.82501220703125, + "objective/kl": 34.87376403808594, + "objective/non_score_reward": -1.7436883449554443, + "objective/rlhf_reward": -5.027342031674321, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 30.888628005981445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 1124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0017213821411133 + }, + { + "episode": 18016, + "epoch": 0.10794358365987226, + "loss/policy_avg": -0.12389272451400757, + "lr": 9.28105828220859e-06, + "objective/entropy": -214.11172485351562, + "objective/kl": 30.432300567626953, + "objective/non_score_reward": -1.5216151475906372, + "objective/rlhf_reward": -6.086460471153259, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5009217262268066, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.671875, + "step": 1125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.000213146209717 + }, + { + "episode": 18032, + "epoch": 0.1080394482990018, + "loss/policy_avg": -0.05387420952320099, + "lr": 9.280419222903886e-06, + "objective/entropy": -152.37127685546875, + "objective/kl": 31.959280014038086, + "objective/non_score_reward": -1.5979639291763306, + "objective/rlhf_reward": -3.9918555974960324, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.7455518245697021, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.611328125, + "step": 1126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001572847366333 + }, + { + "episode": 18048, + "epoch": 0.10813531293813136, + "loss/policy_avg": 0.24824705719947815, + "lr": 9.279780163599183e-06, + "objective/entropy": -152.4130859375, + "objective/kl": 41.757843017578125, + "objective/non_score_reward": -2.0878922939300537, + "objective/rlhf_reward": -8.351569056510925, + "objective/scores": 0.0, + "policy/approxkl_avg": 119.31018829345703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3984375, + "step": 1127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998676061630249 + }, + { + "episode": 18064, + "epoch": 0.1082311775772609, + "loss/policy_avg": 0.07473967224359512, + "lr": 9.279141104294478e-06, + "objective/entropy": -134.43746948242188, + "objective/kl": 45.83389663696289, + "objective/non_score_reward": -2.2916946411132812, + "objective/rlhf_reward": -11.166778564453125, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.130126953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625, + "step": 1128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9962289333343506 + }, + { + "episode": 18080, + "epoch": 0.10832704221639046, + "loss/policy_avg": 0.1333944946527481, + "lr": 9.278502044989775e-06, + "objective/entropy": -150.6534881591797, + "objective/kl": 32.044464111328125, + "objective/non_score_reward": -1.6022231578826904, + "objective/rlhf_reward": -8.408892631530762, + "objective/scores": -0.5, + "policy/approxkl_avg": 104.92411041259766, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.57421875, + "step": 1129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.999082088470459 + }, + { + "episode": 18096, + "epoch": 0.10842290685552, + "loss/policy_avg": 0.22492778301239014, + "lr": 9.277862985685072e-06, + "objective/entropy": -193.1295166015625, + "objective/kl": 31.847850799560547, + "objective/non_score_reward": -1.5923924446105957, + "objective/rlhf_reward": -4.7077106289273365, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 0.5217957496643066, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.564453125, + "step": 1130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0010805130004883 + }, + { + "episode": 18112, + "epoch": 0.10851877149464956, + "loss/policy_avg": 0.03467258810997009, + "lr": 9.277223926380369e-06, + "objective/entropy": -110.80841064453125, + "objective/kl": 31.04827117919922, + "objective/non_score_reward": -1.5524135828018188, + "objective/rlhf_reward": -6.209654331207275, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4131932258605957, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.57421875, + "step": 1131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005240440368652 + }, + { + "episode": 18128, + "epoch": 0.1086146361337791, + "loss/policy_avg": 0.48854583501815796, + "lr": 9.276584867075666e-06, + "objective/entropy": -180.65689086914062, + "objective/kl": 43.2469482421875, + "objective/non_score_reward": -2.1623473167419434, + "objective/rlhf_reward": -7.323876593142671, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 136.09324645996094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.61328125, + "step": 1132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9991611242294312 + }, + { + "episode": 18144, + "epoch": 0.10871050077290866, + "loss/policy_avg": 0.14903083443641663, + "lr": 9.275945807770961e-06, + "objective/entropy": -186.40203857421875, + "objective/kl": 41.23994445800781, + "objective/non_score_reward": -2.061997413635254, + "objective/rlhf_reward": -10.247989654541016, + "objective/scores": -0.5, + "policy/approxkl_avg": 125.9528579711914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 1133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9962701797485352 + }, + { + "episode": 18160, + "epoch": 0.1088063654120382, + "loss/policy_avg": 0.4259245693683624, + "lr": 9.275306748466258e-06, + "objective/entropy": -175.52027893066406, + "objective/kl": 40.598670959472656, + "objective/non_score_reward": -2.029933452606201, + "objective/rlhf_reward": -10.119733810424805, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.297783851623535, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69140625, + "step": 1134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988666772842407 + }, + { + "episode": 18176, + "epoch": 0.10890223005116775, + "loss/policy_avg": 0.4467664361000061, + "lr": 9.274667689161555e-06, + "objective/entropy": -205.2392578125, + "objective/kl": 46.36846160888672, + "objective/non_score_reward": -2.318423271179199, + "objective/rlhf_reward": -11.273693084716797, + "objective/scores": -0.5, + "policy/approxkl_avg": 120.50224304199219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.583984375, + "step": 1135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995296001434326 + }, + { + "episode": 18192, + "epoch": 0.1089980946902973, + "loss/policy_avg": 0.3303377032279968, + "lr": 9.274028629856852e-06, + "objective/entropy": -24.329784393310547, + "objective/kl": 46.25275421142578, + "objective/non_score_reward": -2.3126378059387207, + "objective/rlhf_reward": -7.1278451106706004, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 50.505615234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.775390625, + "step": 1136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9970906972885132 + }, + { + "episode": 18208, + "epoch": 0.10909395932942685, + "loss/policy_avg": 1.4693771600723267, + "lr": 9.273389570552149e-06, + "objective/entropy": 10.024147033691406, + "objective/kl": 35.56517028808594, + "objective/non_score_reward": -1.7782583236694336, + "objective/rlhf_reward": -2.713033056259155, + "objective/scores": 1.1, + "policy/approxkl_avg": 7.3047943115234375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 1.0234375, + "step": 1137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995927810668945 + }, + { + "episode": 18224, + "epoch": 0.1091898239685564, + "loss/policy_avg": 0.07471301406621933, + "lr": 9.272750511247446e-06, + "objective/entropy": -113.88967895507812, + "objective/kl": 37.872718811035156, + "objective/non_score_reward": -1.8936359882354736, + "objective/rlhf_reward": -3.1745441913604733, + "objective/scores": 1.1, + "policy/approxkl_avg": 40.957237243652344, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.69921875, + "step": 1138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9969100952148438 + }, + { + "episode": 18240, + "epoch": 0.10928568860768595, + "loss/policy_avg": 0.08686472475528717, + "lr": 9.27211145194274e-06, + "objective/entropy": -223.0821990966797, + "objective/kl": 26.48002815246582, + "objective/non_score_reward": -1.324001431465149, + "objective/rlhf_reward": -3.8960054874420162, + "objective/scores": 0.35, + "policy/approxkl_avg": 12.589103698730469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.615234375, + "step": 1139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002105236053467 + }, + { + "episode": 18256, + "epoch": 0.10938155324681549, + "loss/policy_avg": 0.003936432301998138, + "lr": 9.271472392638038e-06, + "objective/entropy": -209.745849609375, + "objective/kl": 47.734527587890625, + "objective/non_score_reward": -2.3867263793945312, + "objective/rlhf_reward": -5.146905398368835, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.40452194213867, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.693359375, + "step": 1140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9992518424987793 + }, + { + "episode": 18272, + "epoch": 0.10947741788594505, + "loss/policy_avg": 0.06709360331296921, + "lr": 9.270833333333334e-06, + "objective/entropy": -151.42800903320312, + "objective/kl": 34.8001594543457, + "objective/non_score_reward": -1.740007996559143, + "objective/rlhf_reward": -6.960031867027283, + "objective/scores": 0.0, + "policy/approxkl_avg": 49.170501708984375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4951171875, + "step": 1141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0029659271240234 + }, + { + "episode": 18288, + "epoch": 0.10957328252507459, + "loss/policy_avg": 0.2170058935880661, + "lr": 9.270194274028631e-06, + "objective/entropy": -168.38040161132812, + "objective/kl": 37.98423385620117, + "objective/non_score_reward": -1.8992117643356323, + "objective/rlhf_reward": -7.596847057342529, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.969616174697876, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.681640625, + "step": 1142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000674247741699 + }, + { + "episode": 18304, + "epoch": 0.10966914716420414, + "loss/policy_avg": -0.12431719899177551, + "lr": 9.269555214723928e-06, + "objective/entropy": -113.78993225097656, + "objective/kl": 36.772281646728516, + "objective/non_score_reward": -1.8386142253875732, + "objective/rlhf_reward": -9.354456901550293, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.972061157226562, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.615234375, + "step": 1143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0068745613098145 + }, + { + "episode": 18320, + "epoch": 0.10976501180333369, + "loss/policy_avg": 0.23490570485591888, + "lr": 9.268916155419223e-06, + "objective/entropy": -69.30895233154297, + "objective/kl": 31.198192596435547, + "objective/non_score_reward": -1.5599095821380615, + "objective/rlhf_reward": -8.239639282226562, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.9234464168548584, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.650390625, + "step": 1144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000004768371582 + }, + { + "episode": 18336, + "epoch": 0.10986087644246324, + "loss/policy_avg": -0.39900097250938416, + "lr": 9.26827709611452e-06, + "objective/entropy": -128.88258361816406, + "objective/kl": 50.786354064941406, + "objective/non_score_reward": -2.539318084716797, + "objective/rlhf_reward": -8.778669455138546, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 1.8139411211013794, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6640625, + "step": 1145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0030529499053955 + }, + { + "episode": 18352, + "epoch": 0.10995674108159278, + "loss/policy_avg": 1.4107240438461304, + "lr": 9.267638036809816e-06, + "objective/entropy": -31.088985443115234, + "objective/kl": 44.32902145385742, + "objective/non_score_reward": -2.2164511680603027, + "objective/rlhf_reward": -8.865804195404053, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.591197967529297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 1146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0015499591827393 + }, + { + "episode": 18368, + "epoch": 0.11005260572072234, + "loss/policy_avg": 0.6901419758796692, + "lr": 9.266998977505112e-06, + "objective/entropy": -117.8492202758789, + "objective/kl": 42.22416687011719, + "objective/non_score_reward": -2.111207962036133, + "objective/rlhf_reward": -8.444832682609558, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.155084609985352, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.720703125, + "step": 1147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996108055114746 + }, + { + "episode": 18384, + "epoch": 0.11014847035985188, + "loss/policy_avg": 0.28499191999435425, + "lr": 9.26635991820041e-06, + "objective/entropy": -155.38125610351562, + "objective/kl": 39.15877151489258, + "objective/non_score_reward": -1.957938551902771, + "objective/rlhf_reward": -9.831754684448242, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.332469940185547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.529296875, + "step": 1148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001255512237549 + }, + { + "episode": 18400, + "epoch": 0.11024433499898144, + "loss/policy_avg": 0.004523903131484985, + "lr": 9.265720858895706e-06, + "objective/entropy": -105.31055450439453, + "objective/kl": 35.68909454345703, + "objective/non_score_reward": -1.7844548225402832, + "objective/rlhf_reward": -9.137819290161133, + "objective/scores": -0.5, + "policy/approxkl_avg": 61.14020538330078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48046875, + "step": 1149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984228610992432 + }, + { + "episode": 18416, + "epoch": 0.11034019963811098, + "loss/policy_avg": 0.15958470106124878, + "lr": 9.265081799591003e-06, + "objective/entropy": -173.69564819335938, + "objective/kl": 32.33177947998047, + "objective/non_score_reward": -1.6165889501571655, + "objective/rlhf_reward": -8.46635627746582, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.45054817199707, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.650390625, + "step": 1150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998223781585693 + }, + { + "episode": 18432, + "epoch": 0.11043606427724054, + "loss/policy_avg": 0.09023189544677734, + "lr": 9.2644427402863e-06, + "objective/entropy": -142.7093048095703, + "objective/kl": 39.707950592041016, + "objective/non_score_reward": -1.9853975772857666, + "objective/rlhf_reward": -6.460637572224497, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 87.90142059326172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.607421875, + "step": 1151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9958066940307617 + }, + { + "episode": 18448, + "epoch": 0.11053192891637008, + "loss/policy_avg": -0.08936844766139984, + "lr": 9.263803680981595e-06, + "objective/entropy": -210.497802734375, + "objective/kl": 27.962690353393555, + "objective/non_score_reward": -1.3981345891952515, + "objective/rlhf_reward": -5.592538356781006, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.018726348876953, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7421875, + "step": 1152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.999576210975647 + }, + { + "episode": 18464, + "epoch": 0.11062779355549963, + "loss/policy_avg": 1.8017504215240479, + "lr": 9.263164621676892e-06, + "objective/entropy": -229.4371337890625, + "objective/kl": 34.96674346923828, + "objective/non_score_reward": -1.7483372688293457, + "objective/rlhf_reward": -2.593348717689514, + "objective/scores": 1.1, + "policy/approxkl_avg": 5.226605415344238, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.638671875, + "step": 1153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0037403106689453 + }, + { + "episode": 18480, + "epoch": 0.11072365819462919, + "loss/policy_avg": -0.42118197679519653, + "lr": 9.262525562372189e-06, + "objective/entropy": -205.19778442382812, + "objective/kl": 44.47303009033203, + "objective/non_score_reward": -2.223651647567749, + "objective/rlhf_reward": -8.894606471061707, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.3319854736328125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.580078125, + "step": 1154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9984722137451172 + }, + { + "episode": 18496, + "epoch": 0.11081952283375873, + "loss/policy_avg": 0.2763344943523407, + "lr": 9.261886503067486e-06, + "objective/entropy": -209.70533752441406, + "objective/kl": 38.5892333984375, + "objective/non_score_reward": -1.9294620752334595, + "objective/rlhf_reward": -6.317848420143127, + "objective/scores": 0.35, + "policy/approxkl_avg": 21.387535095214844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.689453125, + "step": 1155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980080127716064 + }, + { + "episode": 18512, + "epoch": 0.11091538747288829, + "loss/policy_avg": 0.5002366304397583, + "lr": 9.261247443762783e-06, + "objective/entropy": -177.98944091796875, + "objective/kl": 34.55400848388672, + "objective/non_score_reward": -1.7277004718780518, + "objective/rlhf_reward": -6.910801649093628, + "objective/scores": 0.0, + "policy/approxkl_avg": 12.66330337524414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5234375, + "step": 1156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.996732473373413 + }, + { + "episode": 18528, + "epoch": 0.11101125211201783, + "loss/policy_avg": -0.07930678129196167, + "lr": 9.260608384458078e-06, + "objective/entropy": -319.6336975097656, + "objective/kl": 33.24048614501953, + "objective/non_score_reward": -1.6620242595672607, + "objective/rlhf_reward": -6.648096799850464, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.9023439884185791, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.525390625, + "step": 1157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997968673706055 + }, + { + "episode": 18544, + "epoch": 0.11110711675114739, + "loss/policy_avg": 0.546525239944458, + "lr": 9.259969325153375e-06, + "objective/entropy": -204.97686767578125, + "objective/kl": 34.17107391357422, + "objective/non_score_reward": -1.7085537910461426, + "objective/rlhf_reward": -8.83421516418457, + "objective/scores": -0.5, + "policy/approxkl_avg": 60.44047164916992, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 1158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998133659362793 + }, + { + "episode": 18560, + "epoch": 0.11120298139027693, + "loss/policy_avg": 0.08323957026004791, + "lr": 9.259330265848672e-06, + "objective/entropy": -194.127197265625, + "objective/kl": 43.20824432373047, + "objective/non_score_reward": -2.160412073135376, + "objective/rlhf_reward": -8.641648411750793, + "objective/scores": 0.0, + "policy/approxkl_avg": 98.96641540527344, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6328125, + "step": 1159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0033438205718994 + }, + { + "episode": 18576, + "epoch": 0.11129884602940648, + "loss/policy_avg": 0.1039925292134285, + "lr": 9.258691206543968e-06, + "objective/entropy": -78.0616455078125, + "objective/kl": 36.93494415283203, + "objective/non_score_reward": -1.8467472791671753, + "objective/rlhf_reward": -9.38698959350586, + "objective/scores": -0.5, + "policy/approxkl_avg": 83.22134399414062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 1160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9986588954925537 + }, + { + "episode": 18592, + "epoch": 0.11139471066853603, + "loss/policy_avg": 0.7211666107177734, + "lr": 9.258052147239265e-06, + "objective/entropy": -185.3721160888672, + "objective/kl": 33.87718200683594, + "objective/non_score_reward": -1.693859338760376, + "objective/rlhf_reward": -8.775437355041504, + "objective/scores": -0.5, + "policy/approxkl_avg": 25.1632022857666, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.640625, + "step": 1161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997310638427734 + }, + { + "episode": 18608, + "epoch": 0.11149057530766558, + "loss/policy_avg": 0.4619596004486084, + "lr": 9.257413087934562e-06, + "objective/entropy": -264.2740173339844, + "objective/kl": 41.9027214050293, + "objective/non_score_reward": -2.0951361656188965, + "objective/rlhf_reward": -10.380544662475586, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.418521404266357, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.78515625, + "step": 1162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0029428005218506 + }, + { + "episode": 18624, + "epoch": 0.11158643994679512, + "loss/policy_avg": 1.5570926666259766, + "lr": 9.256774028629857e-06, + "objective/entropy": -167.19876098632812, + "objective/kl": 43.353755950927734, + "objective/non_score_reward": -2.1676878929138184, + "objective/rlhf_reward": -6.270751690864563, + "objective/scores": 0.6, + "policy/approxkl_avg": 100.45805358886719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.541015625, + "step": 1163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998075008392334 + }, + { + "episode": 18640, + "epoch": 0.11168230458592468, + "loss/policy_avg": 0.06055016070604324, + "lr": 9.256134969325154e-06, + "objective/entropy": -198.5944366455078, + "objective/kl": 41.007789611816406, + "objective/non_score_reward": -2.050389289855957, + "objective/rlhf_reward": -10.201557159423828, + "objective/scores": -0.5, + "policy/approxkl_avg": 66.64646911621094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.615234375, + "step": 1164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0016350746154785 + }, + { + "episode": 18656, + "epoch": 0.11177816922505422, + "loss/policy_avg": -0.025956686586141586, + "lr": 9.255495910020451e-06, + "objective/entropy": -215.60336303710938, + "objective/kl": 35.29871368408203, + "objective/non_score_reward": -1.7649357318878174, + "objective/rlhf_reward": -2.659742927551269, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.8881683349609375, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.736328125, + "step": 1165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0088062286376953 + }, + { + "episode": 18672, + "epoch": 0.11187403386418378, + "loss/policy_avg": 0.050327710807323456, + "lr": 9.254856850715748e-06, + "objective/entropy": -257.5550537109375, + "objective/kl": 34.92894744873047, + "objective/non_score_reward": -1.7464474439620972, + "objective/rlhf_reward": -5.585789775848388, + "objective/scores": 0.35, + "policy/approxkl_avg": 0.7891045808792114, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 1166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0020484924316406 + }, + { + "episode": 18688, + "epoch": 0.11196989850331332, + "loss/policy_avg": 0.28664839267730713, + "lr": 9.254217791411043e-06, + "objective/entropy": -244.4254150390625, + "objective/kl": 33.767906188964844, + "objective/non_score_reward": -1.6883955001831055, + "objective/rlhf_reward": -6.753581643104553, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.146383285522461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.765625, + "step": 1167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9985147714614868 + }, + { + "episode": 18704, + "epoch": 0.11206576314244288, + "loss/policy_avg": 0.08155789971351624, + "lr": 9.25357873210634e-06, + "objective/entropy": -135.68682861328125, + "objective/kl": 40.165077209472656, + "objective/non_score_reward": -2.008253812789917, + "objective/rlhf_reward": -8.033015489578247, + "objective/scores": 0.0, + "policy/approxkl_avg": 70.28575897216797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68359375, + "step": 1168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000505447387695 + }, + { + "episode": 18720, + "epoch": 0.11216162778157242, + "loss/policy_avg": 0.035900428891181946, + "lr": 9.252939672801637e-06, + "objective/entropy": -201.44180297851562, + "objective/kl": 35.17390441894531, + "objective/non_score_reward": -1.758695125579834, + "objective/rlhf_reward": -9.034780502319336, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.6578083038330078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 1169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999299168586731 + }, + { + "episode": 18736, + "epoch": 0.11225749242070197, + "loss/policy_avg": 0.23309114575386047, + "lr": 9.252300613496932e-06, + "objective/entropy": -134.03857421875, + "objective/kl": 50.07221984863281, + "objective/non_score_reward": -2.5036110877990723, + "objective/rlhf_reward": -8.563846330256805, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 21.555416107177734, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.736328125, + "step": 1170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989700317382812 + }, + { + "episode": 18752, + "epoch": 0.11235335705983152, + "loss/policy_avg": -0.04200271517038345, + "lr": 9.251661554192229e-06, + "objective/entropy": -173.83047485351562, + "objective/kl": 30.594785690307617, + "objective/non_score_reward": -1.5297393798828125, + "objective/rlhf_reward": -6.118957281112671, + "objective/scores": 0.0, + "policy/approxkl_avg": 27.509376525878906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.61328125, + "step": 1171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987194538116455 + }, + { + "episode": 18768, + "epoch": 0.11244922169896107, + "loss/policy_avg": 0.0603983998298645, + "lr": 9.251022494887526e-06, + "objective/entropy": -185.75567626953125, + "objective/kl": 38.586204528808594, + "objective/non_score_reward": -1.9293103218078613, + "objective/rlhf_reward": -9.717241287231445, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.716188669204712, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73828125, + "step": 1172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999668836593628 + }, + { + "episode": 18784, + "epoch": 0.11254508633809061, + "loss/policy_avg": -0.09884576499462128, + "lr": 9.250383435582823e-06, + "objective/entropy": -199.0475311279297, + "objective/kl": 33.882347106933594, + "objective/non_score_reward": -1.6941174268722534, + "objective/rlhf_reward": -8.776470184326172, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.142615795135498, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.595703125, + "step": 1173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999181866645813 + }, + { + "episode": 18800, + "epoch": 0.11264095097722017, + "loss/policy_avg": 0.18933314085006714, + "lr": 9.24974437627812e-06, + "objective/entropy": -154.91830444335938, + "objective/kl": 40.73468017578125, + "objective/non_score_reward": -2.036734104156494, + "objective/rlhf_reward": -8.146936416625977, + "objective/scores": 0.0, + "policy/approxkl_avg": 89.70398712158203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6015625, + "step": 1174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001220703125 + }, + { + "episode": 18816, + "epoch": 0.11273681561634971, + "loss/policy_avg": 0.34834325313568115, + "lr": 9.249105316973417e-06, + "objective/entropy": -111.88278198242188, + "objective/kl": 46.60825729370117, + "objective/non_score_reward": -2.3304128646850586, + "objective/rlhf_reward": -7.805879676135715, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 116.61686706542969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.669921875, + "step": 1175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976208209991455 + }, + { + "episode": 18832, + "epoch": 0.11283268025547927, + "loss/policy_avg": 0.09127620607614517, + "lr": 9.248466257668712e-06, + "objective/entropy": -34.16586685180664, + "objective/kl": 41.58169174194336, + "objective/non_score_reward": -2.079084634780884, + "objective/rlhf_reward": -8.316338539123535, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6012015342712402, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.650390625, + "step": 1176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.999786376953125 + }, + { + "episode": 18848, + "epoch": 0.11292854489460881, + "loss/policy_avg": -0.4199152886867523, + "lr": 9.247827198364009e-06, + "objective/entropy": -154.5921173095703, + "objective/kl": 40.367889404296875, + "objective/non_score_reward": -2.0183944702148438, + "objective/rlhf_reward": -10.073577880859375, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.02896785736084, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62890625, + "step": 1177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001009225845337 + }, + { + "episode": 18864, + "epoch": 0.11302440953373837, + "loss/policy_avg": 0.10369812697172165, + "lr": 9.247188139059305e-06, + "objective/entropy": -196.05679321289062, + "objective/kl": 43.01088333129883, + "objective/non_score_reward": -2.1505441665649414, + "objective/rlhf_reward": -10.602176666259766, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.217769622802734, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.611328125, + "step": 1178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0004961490631104 + }, + { + "episode": 18880, + "epoch": 0.11312027417286791, + "loss/policy_avg": 0.07693903893232346, + "lr": 9.246549079754602e-06, + "objective/entropy": -118.85835266113281, + "objective/kl": 29.787172317504883, + "objective/non_score_reward": -1.48935866355896, + "objective/rlhf_reward": -5.95743453502655, + "objective/scores": 0.0, + "policy/approxkl_avg": 32.888973236083984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.501953125, + "step": 1179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001927375793457 + }, + { + "episode": 18896, + "epoch": 0.11321613881199746, + "loss/policy_avg": -0.2185661941766739, + "lr": 9.2459100204499e-06, + "objective/entropy": -109.2579345703125, + "objective/kl": 46.237953186035156, + "objective/non_score_reward": -2.3118972778320312, + "objective/rlhf_reward": -9.247589468955994, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.752757549285889, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.51171875, + "step": 1180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995779991149902 + }, + { + "episode": 18912, + "epoch": 0.113312003451127, + "loss/policy_avg": 0.1016097217798233, + "lr": 9.245270961145194e-06, + "objective/entropy": -183.87237548828125, + "objective/kl": 42.37644958496094, + "objective/non_score_reward": -2.1188225746154785, + "objective/rlhf_reward": -10.475290298461914, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.645549774169922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.603515625, + "step": 1181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000525951385498 + }, + { + "episode": 18928, + "epoch": 0.11340786809025656, + "loss/policy_avg": 0.007417738437652588, + "lr": 9.244631901840491e-06, + "objective/entropy": -106.44618225097656, + "objective/kl": 50.69214630126953, + "objective/non_score_reward": -2.534607172012329, + "objective/rlhf_reward": -8.534308705393391, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 9.546636581420898, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.646484375, + "step": 1182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9999744892120361 + }, + { + "episode": 18944, + "epoch": 0.1135037327293861, + "loss/policy_avg": 0.2074076235294342, + "lr": 9.243992842535788e-06, + "objective/entropy": -169.82958984375, + "objective/kl": 35.57451248168945, + "objective/non_score_reward": -1.7787256240844727, + "objective/rlhf_reward": -4.191183601261351, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 24.878190994262695, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 1183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992552995681763 + }, + { + "episode": 18960, + "epoch": 0.11359959736851566, + "loss/policy_avg": 0.36138978600502014, + "lr": 9.243353783231085e-06, + "objective/entropy": -247.40234375, + "objective/kl": 37.66254425048828, + "objective/non_score_reward": -1.8831273317337036, + "objective/rlhf_reward": -6.108677227695551, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 26.46004295349121, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.623046875, + "step": 1184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995391368865967 + }, + { + "episode": 18976, + "epoch": 0.1136954620076452, + "loss/policy_avg": 0.17048710584640503, + "lr": 9.242714723926382e-06, + "objective/entropy": -251.5931396484375, + "objective/kl": 55.048095703125, + "objective/non_score_reward": -2.7524046897888184, + "objective/rlhf_reward": -11.009619235992432, + "objective/scores": 0.0, + "policy/approxkl_avg": 104.20713806152344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.80859375, + "step": 1185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9988279342651367 + }, + { + "episode": 18992, + "epoch": 0.11379132664677476, + "loss/policy_avg": 0.4166252017021179, + "lr": 9.242075664621679e-06, + "objective/entropy": -141.91104125976562, + "objective/kl": 30.34893035888672, + "objective/non_score_reward": -1.517446517944336, + "objective/rlhf_reward": -4.122374962048466, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 36.105438232421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 1186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9952471256256104 + }, + { + "episode": 19008, + "epoch": 0.1138871912859043, + "loss/policy_avg": -0.09405485540628433, + "lr": 9.241436605316974e-06, + "objective/entropy": -215.87139892578125, + "objective/kl": 30.014829635620117, + "objective/non_score_reward": -1.5007414817810059, + "objective/rlhf_reward": -6.002965927124023, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.9436936378479, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69140625, + "step": 1187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996919631958008 + }, + { + "episode": 19024, + "epoch": 0.11398305592503385, + "loss/policy_avg": 0.06526055932044983, + "lr": 9.240797546012271e-06, + "objective/entropy": -215.0313720703125, + "objective/kl": 28.091075897216797, + "objective/non_score_reward": -1.4045538902282715, + "objective/rlhf_reward": -3.495509090200935, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 9.912910461425781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 1188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.997267723083496 + }, + { + "episode": 19040, + "epoch": 0.1140789205641634, + "loss/policy_avg": 0.2555314898490906, + "lr": 9.240158486707568e-06, + "objective/entropy": -66.75167846679688, + "objective/kl": 50.15288162231445, + "objective/non_score_reward": -2.5076441764831543, + "objective/rlhf_reward": -10.030576586723328, + "objective/scores": 0.0, + "policy/approxkl_avg": 33.272117614746094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.787109375, + "step": 1189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9952104091644287 + }, + { + "episode": 19056, + "epoch": 0.11417478520329295, + "loss/policy_avg": 0.09477699548006058, + "lr": 9.239519427402863e-06, + "objective/entropy": -204.3782196044922, + "objective/kl": 17.24319839477539, + "objective/non_score_reward": -0.8621599674224854, + "objective/rlhf_reward": -3.4486398100852966, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.499485015869141, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.744140625, + "step": 1190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991540908813477 + }, + { + "episode": 19072, + "epoch": 0.1142706498424225, + "loss/policy_avg": 0.27297595143318176, + "lr": 9.23888036809816e-06, + "objective/entropy": -228.1249542236328, + "objective/kl": 32.16904067993164, + "objective/non_score_reward": -1.6084520816802979, + "objective/rlhf_reward": -4.877548783031061, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 50.37181854248047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9970085620880127 + }, + { + "episode": 19088, + "epoch": 0.11436651448155205, + "loss/policy_avg": 0.18055689334869385, + "lr": 9.238241308793457e-06, + "objective/entropy": -219.4647216796875, + "objective/kl": 37.80063247680664, + "objective/non_score_reward": -1.8900315761566162, + "objective/rlhf_reward": -9.560127258300781, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.865171432495117, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.609375, + "step": 1192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9978690147399902 + }, + { + "episode": 19104, + "epoch": 0.11446237912068159, + "loss/policy_avg": 0.039097800850868225, + "lr": 9.237602249488754e-06, + "objective/entropy": -234.62734985351562, + "objective/kl": 30.65601348876953, + "objective/non_score_reward": -1.5328006744384766, + "objective/rlhf_reward": -8.131202697753906, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.4526290893554688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 1193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.997928261756897 + }, + { + "episode": 19120, + "epoch": 0.11455824375981115, + "loss/policy_avg": 0.16015778481960297, + "lr": 9.236963190184049e-06, + "objective/entropy": -145.63986206054688, + "objective/kl": 35.36297607421875, + "objective/non_score_reward": -1.7681488990783691, + "objective/rlhf_reward": -7.072595238685608, + "objective/scores": 0.0, + "policy/approxkl_avg": 12.726469039916992, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.63671875, + "step": 1194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989216327667236 + }, + { + "episode": 19136, + "epoch": 0.11465410839894069, + "loss/policy_avg": 0.3464363217353821, + "lr": 9.236324130879346e-06, + "objective/entropy": -26.99747085571289, + "objective/kl": 35.06488037109375, + "objective/non_score_reward": -1.7532438039779663, + "objective/rlhf_reward": -9.012975692749023, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.075830459594727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.685546875, + "step": 1195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998866081237793 + }, + { + "episode": 19152, + "epoch": 0.11474997303807025, + "loss/policy_avg": 0.22321917116641998, + "lr": 9.235685071574642e-06, + "objective/entropy": -137.14828491210938, + "objective/kl": 42.01813888549805, + "objective/non_score_reward": -2.1009068489074707, + "objective/rlhf_reward": -6.456216524319585, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 62.788055419921875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.740234375, + "step": 1196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9967682361602783 + }, + { + "episode": 19168, + "epoch": 0.11484583767719979, + "loss/policy_avg": 0.5135493874549866, + "lr": 9.23504601226994e-06, + "objective/entropy": -105.36206817626953, + "objective/kl": 44.23863220214844, + "objective/non_score_reward": -2.2119314670562744, + "objective/rlhf_reward": -7.447725868225097, + "objective/scores": 0.35, + "policy/approxkl_avg": 23.04653549194336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.591796875, + "step": 1197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9975529909133911 + }, + { + "episode": 19184, + "epoch": 0.11494170231632934, + "loss/policy_avg": 0.524350643157959, + "lr": 9.234406952965236e-06, + "objective/entropy": -241.46133422851562, + "objective/kl": 55.43983459472656, + "objective/non_score_reward": -2.771991729736328, + "objective/rlhf_reward": -9.70936403521667, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 8.057157516479492, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 1198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998246192932129 + }, + { + "episode": 19200, + "epoch": 0.11503756695545889, + "loss/policy_avg": 0.05571490526199341, + "lr": 9.233767893660533e-06, + "objective/entropy": -106.03378295898438, + "objective/kl": 38.72266387939453, + "objective/non_score_reward": -1.9361332654953003, + "objective/rlhf_reward": -7.744532823562622, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.442030906677246, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.611328125, + "step": 1199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9999315738677979 + }, + { + "episode": 19216, + "epoch": 0.11513343159458844, + "loss/policy_avg": 1.25569486618042, + "lr": 9.233128834355828e-06, + "objective/entropy": -206.72470092773438, + "objective/kl": 38.810768127441406, + "objective/non_score_reward": -1.9405385255813599, + "objective/rlhf_reward": -9.762153625488281, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.192720413208008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 1200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0066146850585938 + }, + { + "episode": 19232, + "epoch": 0.11522929623371798, + "loss/policy_avg": -0.046953827142715454, + "lr": 9.232489775051125e-06, + "objective/entropy": -277.1523742675781, + "objective/kl": 29.762226104736328, + "objective/non_score_reward": -1.4881114959716797, + "objective/rlhf_reward": -5.952445864677429, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.098554611206055, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.689453125, + "step": 1201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0129566192626953 + }, + { + "episode": 19248, + "epoch": 0.11532516087284754, + "loss/policy_avg": -0.15201960504055023, + "lr": 9.231850715746422e-06, + "objective/entropy": -155.02728271484375, + "objective/kl": 31.08127784729004, + "objective/non_score_reward": -1.5540640354156494, + "objective/rlhf_reward": -8.216256141662598, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.0597944259643555, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.595703125, + "step": 1202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001163482666016 + }, + { + "episode": 19264, + "epoch": 0.11542102551197708, + "loss/policy_avg": 0.6680903434753418, + "lr": 9.231211656441719e-06, + "objective/entropy": -114.69148254394531, + "objective/kl": 45.13048553466797, + "objective/non_score_reward": -2.2565245628356934, + "objective/rlhf_reward": -9.026098251342773, + "objective/scores": 0.0, + "policy/approxkl_avg": 14.719629287719727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.568359375, + "step": 1203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990174770355225 + }, + { + "episode": 19280, + "epoch": 0.11551689015110664, + "loss/policy_avg": 0.2732774317264557, + "lr": 9.230572597137016e-06, + "objective/entropy": -211.10403442382812, + "objective/kl": 39.7333984375, + "objective/non_score_reward": -1.9866702556610107, + "objective/rlhf_reward": -9.946681022644043, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.76051712036133, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.724609375, + "step": 1204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998841404914856 + }, + { + "episode": 19296, + "epoch": 0.11561275479023618, + "loss/policy_avg": 0.006262578070163727, + "lr": 9.229933537832311e-06, + "objective/entropy": -137.3421630859375, + "objective/kl": 40.68704605102539, + "objective/non_score_reward": -2.0343523025512695, + "objective/rlhf_reward": -6.758807041732174, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 5.212194442749023, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 1205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0027616024017334 + }, + { + "episode": 19312, + "epoch": 0.11570861942936574, + "loss/policy_avg": 0.37284693121910095, + "lr": 9.229294478527608e-06, + "objective/entropy": -207.12786865234375, + "objective/kl": 37.499149322509766, + "objective/non_score_reward": -1.8749574422836304, + "objective/rlhf_reward": -9.49983024597168, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.222797393798828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 1206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9978138208389282 + }, + { + "episode": 19328, + "epoch": 0.11580448406849528, + "loss/policy_avg": 3.9033591747283936e-05, + "lr": 9.228655419222905e-06, + "objective/entropy": -241.87142944335938, + "objective/kl": 44.25556945800781, + "objective/non_score_reward": -2.2127785682678223, + "objective/rlhf_reward": -10.851114273071289, + "objective/scores": -0.5, + "policy/approxkl_avg": 90.05424499511719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000077247619629 + }, + { + "episode": 19344, + "epoch": 0.11590034870762483, + "loss/policy_avg": 0.04275323450565338, + "lr": 9.228016359918202e-06, + "objective/entropy": -212.75177001953125, + "objective/kl": 35.541194915771484, + "objective/non_score_reward": -1.77705979347229, + "objective/rlhf_reward": -7.108238935470581, + "objective/scores": 0.0, + "policy/approxkl_avg": 12.960420608520508, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.689453125, + "step": 1208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0036187171936035 + }, + { + "episode": 19360, + "epoch": 0.11599621334675438, + "loss/policy_avg": -0.3366244435310364, + "lr": 9.227377300613499e-06, + "objective/entropy": -174.33404541015625, + "objective/kl": 40.26109313964844, + "objective/non_score_reward": -2.013054609298706, + "objective/rlhf_reward": -5.929512085691963, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 16.313072204589844, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5078125, + "step": 1209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0050065517425537 + }, + { + "episode": 19376, + "epoch": 0.11609207798588393, + "loss/policy_avg": 0.27779069542884827, + "lr": 9.226738241308795e-06, + "objective/entropy": -205.51553344726562, + "objective/kl": 31.215747833251953, + "objective/non_score_reward": -1.560787320137024, + "objective/rlhf_reward": -8.243148803710938, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.01480484008789, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.591796875, + "step": 1210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998305320739746 + }, + { + "episode": 19392, + "epoch": 0.11618794262501349, + "loss/policy_avg": 0.5489068031311035, + "lr": 9.22609918200409e-06, + "objective/entropy": -107.10516357421875, + "objective/kl": 35.4151725769043, + "objective/non_score_reward": -1.7707587480545044, + "objective/rlhf_reward": -5.659202892978755, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 17.363006591796875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.52734375, + "step": 1211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.998681664466858 + }, + { + "episode": 19408, + "epoch": 0.11628380726414303, + "loss/policy_avg": 0.09689401090145111, + "lr": 9.225460122699387e-06, + "objective/entropy": -116.89547729492188, + "objective/kl": 33.24856948852539, + "objective/non_score_reward": -1.662428379058838, + "objective/rlhf_reward": -8.649714469909668, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.7079713344573975, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 1212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9992122650146484 + }, + { + "episode": 19424, + "epoch": 0.11637967190327259, + "loss/policy_avg": -0.005663630552589893, + "lr": 9.224821063394683e-06, + "objective/entropy": -224.07037353515625, + "objective/kl": 36.07778549194336, + "objective/non_score_reward": -1.803889274597168, + "objective/rlhf_reward": -9.215557098388672, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.525299072265625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.650390625, + "step": 1213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989194869995117 + }, + { + "episode": 19440, + "epoch": 0.11647553654240213, + "loss/policy_avg": 1.2613228559494019, + "lr": 9.22418200408998e-06, + "objective/entropy": -110.88761901855469, + "objective/kl": 42.118072509765625, + "objective/non_score_reward": -2.1059038639068604, + "objective/rlhf_reward": -8.423615217208862, + "objective/scores": 0.0, + "policy/approxkl_avg": 50.087554931640625, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5625, + "step": 1214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994966983795166 + }, + { + "episode": 19456, + "epoch": 0.11657140118153168, + "loss/policy_avg": 1.7253010272979736, + "lr": 9.223542944785276e-06, + "objective/entropy": -195.07435607910156, + "objective/kl": 49.77064895629883, + "objective/non_score_reward": -2.488532543182373, + "objective/rlhf_reward": -11.954130172729492, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.2726173400878906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.724609375, + "step": 1215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002929449081421 + }, + { + "episode": 19472, + "epoch": 0.11666726582066123, + "loss/policy_avg": 0.4091072082519531, + "lr": 9.222903885480573e-06, + "objective/entropy": -170.8401641845703, + "objective/kl": 47.975318908691406, + "objective/non_score_reward": -2.398765802383423, + "objective/rlhf_reward": -9.595062971115112, + "objective/scores": 0.0, + "policy/approxkl_avg": 31.722942352294922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.60546875, + "step": 1216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9983785152435303 + }, + { + "episode": 19488, + "epoch": 0.11676313045979078, + "loss/policy_avg": 0.06230299919843674, + "lr": 9.22226482617587e-06, + "objective/entropy": -190.013671875, + "objective/kl": 28.717660903930664, + "objective/non_score_reward": -1.4358831644058228, + "objective/rlhf_reward": -7.743532657623291, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.035560607910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 1217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.000926971435547 + }, + { + "episode": 19504, + "epoch": 0.11685899509892032, + "loss/policy_avg": 0.6068574786186218, + "lr": 9.221625766871165e-06, + "objective/entropy": -276.9735107421875, + "objective/kl": 43.07137680053711, + "objective/non_score_reward": -2.153568744659424, + "objective/rlhf_reward": -10.614274978637695, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.093493938446045, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.626953125, + "step": 1218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.004453182220459 + }, + { + "episode": 19520, + "epoch": 0.11695485973804988, + "loss/policy_avg": 0.15249356627464294, + "lr": 9.220986707566462e-06, + "objective/entropy": -56.790042877197266, + "objective/kl": 36.03568649291992, + "objective/non_score_reward": -1.8017845153808594, + "objective/rlhf_reward": -5.828535773841244, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 95.31053161621094, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.69140625, + "step": 1219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0025434494018555 + }, + { + "episode": 19536, + "epoch": 0.11705072437717942, + "loss/policy_avg": 0.14016011357307434, + "lr": 9.220347648261759e-06, + "objective/entropy": -223.62423706054688, + "objective/kl": 27.627132415771484, + "objective/non_score_reward": -1.3813567161560059, + "objective/rlhf_reward": -4.214745646318794, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 6.043001174926758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65625, + "step": 1220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.998063087463379 + }, + { + "episode": 19552, + "epoch": 0.11714658901630898, + "loss/policy_avg": 0.1201862320303917, + "lr": 9.219708588957056e-06, + "objective/entropy": -161.87193298339844, + "objective/kl": 32.587364196777344, + "objective/non_score_reward": -1.6293680667877197, + "objective/rlhf_reward": -8.517471313476562, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.10272216796875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 1221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999218463897705 + }, + { + "episode": 19568, + "epoch": 0.11724245365543852, + "loss/policy_avg": 0.17292506992816925, + "lr": 9.219069529652353e-06, + "objective/entropy": -156.0726776123047, + "objective/kl": 30.819015502929688, + "objective/non_score_reward": -1.5409507751464844, + "objective/rlhf_reward": -6.163802862167358, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.231681823730469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646484375, + "step": 1222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975740909576416 + }, + { + "episode": 19584, + "epoch": 0.11733831829456808, + "loss/policy_avg": -0.10841546952724457, + "lr": 9.21843047034765e-06, + "objective/entropy": -141.7899627685547, + "objective/kl": 35.38237762451172, + "objective/non_score_reward": -1.7691190242767334, + "objective/rlhf_reward": -5.697873451796871, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 12.906152725219727, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.71484375, + "step": 1223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000526189804077 + }, + { + "episode": 19600, + "epoch": 0.11743418293369762, + "loss/policy_avg": 0.3013886511325836, + "lr": 9.217791411042945e-06, + "objective/entropy": -159.78964233398438, + "objective/kl": 41.615631103515625, + "objective/non_score_reward": -2.0807814598083496, + "objective/rlhf_reward": -8.323126316070557, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8738489151000977, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.791015625, + "step": 1224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988994598388672 + }, + { + "episode": 19616, + "epoch": 0.11753004757282717, + "loss/policy_avg": 0.06031399220228195, + "lr": 9.217152351738242e-06, + "objective/entropy": -219.4001922607422, + "objective/kl": 32.87148666381836, + "objective/non_score_reward": -1.6435742378234863, + "objective/rlhf_reward": -8.574296951293945, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.2064297199249268, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 1225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002220630645752 + }, + { + "episode": 19632, + "epoch": 0.11762591221195672, + "loss/policy_avg": 1.2073478698730469, + "lr": 9.216513292433539e-06, + "objective/entropy": -216.6431884765625, + "objective/kl": 40.24296569824219, + "objective/non_score_reward": -2.012148380279541, + "objective/rlhf_reward": -10.048593521118164, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.075427532196045, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6328125, + "step": 1226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0001580715179443 + }, + { + "episode": 19648, + "epoch": 0.11772177685108627, + "loss/policy_avg": 0.13787195086479187, + "lr": 9.215874233128836e-06, + "objective/entropy": -230.93502807617188, + "objective/kl": 42.792022705078125, + "objective/non_score_reward": -2.139601230621338, + "objective/rlhf_reward": -8.558404684066772, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.827735900878906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65234375, + "step": 1227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979270696640015 + }, + { + "episode": 19664, + "epoch": 0.11781764149021581, + "loss/policy_avg": 0.7251629829406738, + "lr": 9.215235173824132e-06, + "objective/entropy": -183.5417022705078, + "objective/kl": 39.72979736328125, + "objective/non_score_reward": -1.9864901304244995, + "objective/rlhf_reward": -6.4301886198841895, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 3.1887898445129395, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58203125, + "step": 1228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989159107208252 + }, + { + "episode": 19680, + "epoch": 0.11791350612934537, + "loss/policy_avg": 0.4710264801979065, + "lr": 9.21459611451943e-06, + "objective/entropy": -126.37023162841797, + "objective/kl": 44.64482116699219, + "objective/non_score_reward": -2.232241153717041, + "objective/rlhf_reward": -8.928964257240295, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.475590705871582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.77734375, + "step": 1229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999788522720337 + }, + { + "episode": 19696, + "epoch": 0.11800937076847491, + "loss/policy_avg": 0.07359868288040161, + "lr": 9.213957055214725e-06, + "objective/entropy": -165.31097412109375, + "objective/kl": 25.95929527282715, + "objective/non_score_reward": -1.2979648113250732, + "objective/rlhf_reward": -0.7918591260910031, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.2322168350219727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000141143798828 + }, + { + "episode": 19712, + "epoch": 0.11810523540760447, + "loss/policy_avg": 1.2666089534759521, + "lr": 9.213317995910021e-06, + "objective/entropy": -192.01089477539062, + "objective/kl": 41.55731964111328, + "objective/non_score_reward": -2.0778658390045166, + "objective/rlhf_reward": -8.311463594436646, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8330421447753906, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.59765625, + "step": 1231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000436305999756 + }, + { + "episode": 19728, + "epoch": 0.11820110004673401, + "loss/policy_avg": 0.11511238664388657, + "lr": 9.212678936605318e-06, + "objective/entropy": -236.70005798339844, + "objective/kl": 39.305538177490234, + "objective/non_score_reward": -1.9652769565582275, + "objective/rlhf_reward": -7.86110782623291, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.043410301208496, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66015625, + "step": 1232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0033040046691895 + }, + { + "episode": 19744, + "epoch": 0.11829696468586356, + "loss/policy_avg": 0.146561399102211, + "lr": 9.212039877300615e-06, + "objective/entropy": -211.19525146484375, + "objective/kl": 35.35223388671875, + "objective/non_score_reward": -1.7676118612289429, + "objective/rlhf_reward": -4.1467285498392314, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.833638906478882, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.662109375, + "step": 1233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.997934341430664 + }, + { + "episode": 19760, + "epoch": 0.1183928293249931, + "loss/policy_avg": 0.26507917046546936, + "lr": 9.21140081799591e-06, + "objective/entropy": -241.573974609375, + "objective/kl": 33.000160217285156, + "objective/non_score_reward": -1.650007963180542, + "objective/rlhf_reward": -6.6000319719314575, + "objective/scores": 0.0, + "policy/approxkl_avg": 46.60485076904297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.787109375, + "step": 1234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0001449584960938 + }, + { + "episode": 19776, + "epoch": 0.11848869396412266, + "loss/policy_avg": 0.0665254145860672, + "lr": 9.210761758691207e-06, + "objective/entropy": -133.48040771484375, + "objective/kl": 35.46574401855469, + "objective/non_score_reward": -1.773287057876587, + "objective/rlhf_reward": -5.3598151365915925, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 20.336368560791016, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.57421875, + "step": 1235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9977645874023438 + }, + { + "episode": 19792, + "epoch": 0.1185845586032522, + "loss/policy_avg": 0.36719992756843567, + "lr": 9.210122699386504e-06, + "objective/entropy": -117.32669830322266, + "objective/kl": 48.35634994506836, + "objective/non_score_reward": -2.4178178310394287, + "objective/rlhf_reward": -11.671271324157715, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.851215362548828, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.55859375, + "step": 1236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9996728897094727 + }, + { + "episode": 19808, + "epoch": 0.11868042324238176, + "loss/policy_avg": 0.4682718515396118, + "lr": 9.2094836400818e-06, + "objective/entropy": -206.222412109375, + "objective/kl": 45.98625564575195, + "objective/non_score_reward": -2.2993125915527344, + "objective/rlhf_reward": -9.19725090265274, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.436807632446289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.552734375, + "step": 1237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0007567405700684 + }, + { + "episode": 19824, + "epoch": 0.1187762878815113, + "loss/policy_avg": -0.01652948558330536, + "lr": 9.208844580777096e-06, + "objective/entropy": -188.5276641845703, + "objective/kl": 32.837188720703125, + "objective/non_score_reward": -1.641859531402588, + "objective/rlhf_reward": -6.5674378871917725, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.563506126403809, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.640625, + "step": 1238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9983434677124023 + }, + { + "episode": 19840, + "epoch": 0.11887215252064086, + "loss/policy_avg": 0.09944380819797516, + "lr": 9.208205521472393e-06, + "objective/entropy": -136.99464416503906, + "objective/kl": 37.38746643066406, + "objective/non_score_reward": -1.8693733215332031, + "objective/rlhf_reward": -9.477493286132812, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.7434344291687012, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6484375, + "step": 1239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0000529289245605 + }, + { + "episode": 19856, + "epoch": 0.1189680171597704, + "loss/policy_avg": 0.2101641595363617, + "lr": 9.20756646216769e-06, + "objective/entropy": -194.30543518066406, + "objective/kl": 31.27606201171875, + "objective/non_score_reward": -1.5638031959533691, + "objective/rlhf_reward": -6.255212783813477, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.194089412689209, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69921875, + "step": 1240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0002334117889404 + }, + { + "episode": 19872, + "epoch": 0.11906388179889996, + "loss/policy_avg": 0.3749880790710449, + "lr": 9.206927402862987e-06, + "objective/entropy": -190.45925903320312, + "objective/kl": 46.663055419921875, + "objective/non_score_reward": -2.3331527709960938, + "objective/rlhf_reward": -9.332611322402954, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.1509120464324951, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.68359375, + "step": 1241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001755714416504 + }, + { + "episode": 19888, + "epoch": 0.1191597464380295, + "loss/policy_avg": 1.915224313735962, + "lr": 9.206288343558284e-06, + "objective/entropy": -157.99041748046875, + "objective/kl": 43.092308044433594, + "objective/non_score_reward": -2.1546154022216797, + "objective/rlhf_reward": -10.618461608886719, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.9945058822631836, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.630859375, + "step": 1242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.003427028656006 + }, + { + "episode": 19904, + "epoch": 0.11925561107715905, + "loss/policy_avg": 0.21879303455352783, + "lr": 9.205649284253579e-06, + "objective/entropy": -219.91407775878906, + "objective/kl": 33.03980255126953, + "objective/non_score_reward": -1.6519904136657715, + "objective/rlhf_reward": -5.1573631568864435, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 23.242431640625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.76171875, + "step": 1243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9982752799987793 + }, + { + "episode": 19920, + "epoch": 0.1193514757162886, + "loss/policy_avg": 0.42668187618255615, + "lr": 9.205010224948876e-06, + "objective/entropy": -144.3309326171875, + "objective/kl": 39.1207275390625, + "objective/non_score_reward": -1.9560365676879883, + "objective/rlhf_reward": -6.44554398306976, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 16.93752670288086, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73828125, + "step": 1244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9974477291107178 + }, + { + "episode": 19936, + "epoch": 0.11944734035541815, + "loss/policy_avg": 0.2975335121154785, + "lr": 9.204371165644173e-06, + "objective/entropy": -254.49005126953125, + "objective/kl": 37.68057632446289, + "objective/non_score_reward": -1.884028673171997, + "objective/rlhf_reward": -5.979855745044306, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 16.638486862182617, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8046875, + "step": 1245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0001516342163086 + }, + { + "episode": 19952, + "epoch": 0.1195432049945477, + "loss/policy_avg": 0.10268110036849976, + "lr": 9.20373210633947e-06, + "objective/entropy": -0.897064208984375, + "objective/kl": 33.41365051269531, + "objective/non_score_reward": -1.670682430267334, + "objective/rlhf_reward": -8.682729721069336, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.68716812133789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.802734375, + "step": 1246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000943183898926 + }, + { + "episode": 19968, + "epoch": 0.11963906963367725, + "loss/policy_avg": 0.5944703817367554, + "lr": 9.203093047034766e-06, + "objective/entropy": -125.2044906616211, + "objective/kl": 29.878034591674805, + "objective/non_score_reward": -1.4939017295837402, + "objective/rlhf_reward": -7.975606918334961, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.386956214904785, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.740234375, + "step": 1247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996213912963867 + }, + { + "episode": 19984, + "epoch": 0.11973493427280679, + "loss/policy_avg": 0.26909464597702026, + "lr": 9.202453987730062e-06, + "objective/entropy": -165.02374267578125, + "objective/kl": 41.21909713745117, + "objective/non_score_reward": -2.0609548091888428, + "objective/rlhf_reward": -6.793221096606597, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 5.399163722991943, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 1248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985885620117188 + }, + { + "episode": 20000, + "epoch": 0.11983079891193635, + "loss/policy_avg": 0.06403729319572449, + "lr": 9.201814928425358e-06, + "objective/entropy": -213.74514770507812, + "objective/kl": 43.81724548339844, + "objective/non_score_reward": -2.1908626556396484, + "objective/rlhf_reward": -4.36345020532608, + "objective/scores": 1.1, + "policy/approxkl_avg": 58.358123779296875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 1249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9976136684417725 + }, + { + "episode": 20016, + "epoch": 0.11992666355106589, + "loss/policy_avg": -0.04170785844326019, + "lr": 9.201175869120655e-06, + "objective/entropy": -59.382179260253906, + "objective/kl": 42.117557525634766, + "objective/non_score_reward": -2.1058778762817383, + "objective/rlhf_reward": -8.423511981964111, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.038078308105469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73828125, + "step": 1250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989477396011353 + }, + { + "episode": 20032, + "epoch": 0.12002252819019545, + "loss/policy_avg": 0.2711395025253296, + "lr": 9.200536809815952e-06, + "objective/entropy": -195.36456298828125, + "objective/kl": 32.941802978515625, + "objective/non_score_reward": -1.6470900774002075, + "objective/rlhf_reward": -5.229110562537594, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 22.37289810180664, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.728515625, + "step": 1251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998055338859558 + }, + { + "episode": 20048, + "epoch": 0.12011839282932499, + "loss/policy_avg": 3.912698745727539, + "lr": 9.199897750511249e-06, + "objective/entropy": -199.9739990234375, + "objective/kl": 40.26013946533203, + "objective/non_score_reward": -2.013007164001465, + "objective/rlhf_reward": -10.05202865600586, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.0693812370300293, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.73828125, + "step": 1252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000178813934326 + }, + { + "episode": 20064, + "epoch": 0.12021425746845454, + "loss/policy_avg": 0.013571079820394516, + "lr": 9.199258691206546e-06, + "objective/entropy": -123.17973327636719, + "objective/kl": 42.9838752746582, + "objective/non_score_reward": -2.14919376373291, + "objective/rlhf_reward": -6.992654714647847, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 1.2502174377441406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.65234375, + "step": 1253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000396728515625 + }, + { + "episode": 20080, + "epoch": 0.12031012210758409, + "loss/policy_avg": 0.4384981393814087, + "lr": 9.198619631901841e-06, + "objective/entropy": -193.27044677734375, + "objective/kl": 39.18042755126953, + "objective/non_score_reward": -1.9590213298797607, + "objective/rlhf_reward": -7.836085557937622, + "objective/scores": 0.0, + "policy/approxkl_avg": 19.84510040283203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.681640625, + "step": 1254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9983214139938354 + }, + { + "episode": 20096, + "epoch": 0.12040598674671364, + "loss/policy_avg": -0.2049793154001236, + "lr": 9.197980572597138e-06, + "objective/entropy": -103.8299560546875, + "objective/kl": 37.63689422607422, + "objective/non_score_reward": -1.8818447589874268, + "objective/rlhf_reward": -6.103546936710444, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 8.113576889038086, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5625, + "step": 1255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.002764940261841 + }, + { + "episode": 20112, + "epoch": 0.12050185138584318, + "loss/policy_avg": -0.055029574781656265, + "lr": 9.197341513292433e-06, + "objective/entropy": -130.7873077392578, + "objective/kl": 23.971694946289062, + "objective/non_score_reward": -1.198584794998169, + "objective/rlhf_reward": -1.8706199272882666, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.777101039886475, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.76953125, + "step": 1256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989140033721924 + }, + { + "episode": 20128, + "epoch": 0.12059771602497274, + "loss/policy_avg": -0.03776903823018074, + "lr": 9.19670245398773e-06, + "objective/entropy": -91.43289947509766, + "objective/kl": 38.78535461425781, + "objective/non_score_reward": -1.939267873764038, + "objective/rlhf_reward": -6.276119115765452, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 2.1053802967071533, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.61328125, + "step": 1257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0026392936706543 + }, + { + "episode": 20144, + "epoch": 0.12069358066410228, + "loss/policy_avg": 2.407710313796997, + "lr": 9.196063394683027e-06, + "objective/entropy": -134.95916748046875, + "objective/kl": 42.69334411621094, + "objective/non_score_reward": -2.134667158126831, + "objective/rlhf_reward": -10.538668632507324, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.5845794677734375, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.716796875, + "step": 1258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004944801330566 + }, + { + "episode": 20160, + "epoch": 0.12078944530323184, + "loss/policy_avg": -0.23540058732032776, + "lr": 9.195424335378324e-06, + "objective/entropy": -97.24913024902344, + "objective/kl": 46.6688232421875, + "objective/non_score_reward": -2.3334410190582275, + "objective/rlhf_reward": -9.3337641954422, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.759963035583496, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.650390625, + "step": 1259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992964267730713 + }, + { + "episode": 20176, + "epoch": 0.12088530994236138, + "loss/policy_avg": 0.04770183563232422, + "lr": 9.19478527607362e-06, + "objective/entropy": -211.20736694335938, + "objective/kl": 47.27978515625, + "objective/non_score_reward": -2.3639893531799316, + "objective/rlhf_reward": -9.455957651138306, + "objective/scores": 0.0, + "policy/approxkl_avg": 53.94694519042969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6640625, + "step": 1260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9971760511398315 + }, + { + "episode": 20192, + "epoch": 0.12098117458149094, + "loss/policy_avg": 0.1092609241604805, + "lr": 9.194146216768916e-06, + "objective/entropy": -207.302001953125, + "objective/kl": 52.52091598510742, + "objective/non_score_reward": -2.6260457038879395, + "objective/rlhf_reward": -9.053584675402984, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 2.0765137672424316, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.693359375, + "step": 1261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0045700073242188 + }, + { + "episode": 20208, + "epoch": 0.12107703922062048, + "loss/policy_avg": 0.14769725501537323, + "lr": 9.193507157464213e-06, + "objective/entropy": -275.3343505859375, + "objective/kl": 44.57439041137695, + "objective/non_score_reward": -2.228719472885132, + "objective/rlhf_reward": -7.253018384397613, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 32.329620361328125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.662109375, + "step": 1262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9996920824050903 + }, + { + "episode": 20224, + "epoch": 0.12117290385975003, + "loss/policy_avg": 0.7402910590171814, + "lr": 9.19286809815951e-06, + "objective/entropy": -220.82089233398438, + "objective/kl": 31.622211456298828, + "objective/non_score_reward": -1.5811107158660889, + "objective/rlhf_reward": -6.324442744255066, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.604763984680176, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69921875, + "step": 1263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9981896877288818 + }, + { + "episode": 20240, + "epoch": 0.12126876849887958, + "loss/policy_avg": 0.34828829765319824, + "lr": 9.192229038854807e-06, + "objective/entropy": -223.461669921875, + "objective/kl": 37.80855178833008, + "objective/non_score_reward": -1.890427589416504, + "objective/rlhf_reward": -9.561710357666016, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.94008731842041, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.791015625, + "step": 1264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980820417404175 + }, + { + "episode": 20256, + "epoch": 0.12136463313800913, + "loss/policy_avg": 0.7590650320053101, + "lr": 9.191589979550103e-06, + "objective/entropy": -212.02252197265625, + "objective/kl": 33.04069137573242, + "objective/non_score_reward": -1.6520345211029053, + "objective/rlhf_reward": -8.608138084411621, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.59402084350586, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8125, + "step": 1265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 12, + "val/ratio": 2.0001893043518066 + }, + { + "episode": 20272, + "epoch": 0.12146049777713867, + "loss/policy_avg": 0.11782548576593399, + "lr": 9.1909509202454e-06, + "objective/entropy": -253.17308044433594, + "objective/kl": 45.47588348388672, + "objective/non_score_reward": -2.273794412612915, + "objective/rlhf_reward": -9.09517765045166, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0098719596862793, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997332096099854 + }, + { + "episode": 20288, + "epoch": 0.12155636241626823, + "loss/policy_avg": 0.22774580121040344, + "lr": 9.190311860940695e-06, + "objective/entropy": -232.32333374023438, + "objective/kl": 30.13494110107422, + "objective/non_score_reward": -1.5067470073699951, + "objective/rlhf_reward": -6.02698814868927, + "objective/scores": 0.0, + "policy/approxkl_avg": 70.37385559082031, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.767578125, + "step": 1267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 12, + "val/ratio": 1.997234582901001 + }, + { + "episode": 20304, + "epoch": 0.12165222705539779, + "loss/policy_avg": 0.2677008807659149, + "lr": 9.189672801635992e-06, + "objective/entropy": -243.69558715820312, + "objective/kl": 28.494319915771484, + "objective/non_score_reward": -1.4247161149978638, + "objective/rlhf_reward": -2.775145445705625, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 6.425749778747559, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62109375, + "step": 1268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999290943145752 + }, + { + "episode": 20320, + "epoch": 0.12174809169452733, + "loss/policy_avg": 1.1159042119979858, + "lr": 9.18903374233129e-06, + "objective/entropy": -88.82687377929688, + "objective/kl": 51.72290802001953, + "objective/non_score_reward": -2.5861456394195557, + "objective/rlhf_reward": -10.344582319259644, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5299274921417236, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.603515625, + "step": 1269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986238479614258 + }, + { + "episode": 20336, + "epoch": 0.12184395633365688, + "loss/policy_avg": 0.34005099534988403, + "lr": 9.188394683026586e-06, + "objective/entropy": -74.426513671875, + "objective/kl": 31.864864349365234, + "objective/non_score_reward": -1.5932432413101196, + "objective/rlhf_reward": -5.047460052996797, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 26.33525848388672, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.552734375, + "step": 1270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0006790161132812 + }, + { + "episode": 20352, + "epoch": 0.12193982097278643, + "loss/policy_avg": 34.044898986816406, + "lr": 9.187755623721883e-06, + "objective/entropy": -243.47544860839844, + "objective/kl": 33.51547622680664, + "objective/non_score_reward": -1.6757738590240479, + "objective/rlhf_reward": -6.703095495700836, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.789956092834473, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.75, + "step": 1271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0053799152374268 + }, + { + "episode": 20368, + "epoch": 0.12203568561191598, + "loss/policy_avg": 0.0031574219465255737, + "lr": 9.187116564417178e-06, + "objective/entropy": -204.58706665039062, + "objective/kl": 26.3686466217041, + "objective/non_score_reward": -1.318432331085205, + "objective/rlhf_reward": -3.849897225101558, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 1.2938048839569092, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6015625, + "step": 1272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0001134872436523 + }, + { + "episode": 20384, + "epoch": 0.12213155025104552, + "loss/policy_avg": 0.31501248478889465, + "lr": 9.186477505112475e-06, + "objective/entropy": -155.09539794921875, + "objective/kl": 43.250457763671875, + "objective/non_score_reward": -2.162522792816162, + "objective/rlhf_reward": -10.650091171264648, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.452056884765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 1273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9961915016174316 + }, + { + "episode": 20400, + "epoch": 0.12222741489017508, + "loss/policy_avg": -0.12414050102233887, + "lr": 9.185838445807772e-06, + "objective/entropy": -273.3624572753906, + "objective/kl": 42.63136291503906, + "objective/non_score_reward": -2.13156795501709, + "objective/rlhf_reward": -10.52627182006836, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.34860610961914, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.73046875, + "step": 1274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0017499923706055 + }, + { + "episode": 20416, + "epoch": 0.12232327952930462, + "loss/policy_avg": 0.8902416825294495, + "lr": 9.185199386503069e-06, + "objective/entropy": -233.17616271972656, + "objective/kl": 37.79168701171875, + "objective/non_score_reward": -1.8895843029022217, + "objective/rlhf_reward": -5.825004235903421, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 6.202553749084473, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.58203125, + "step": 1275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000479221343994 + }, + { + "episode": 20432, + "epoch": 0.12241914416843418, + "loss/policy_avg": 0.3103522062301636, + "lr": 9.184560327198366e-06, + "objective/entropy": -145.01295471191406, + "objective/kl": 30.957489013671875, + "objective/non_score_reward": -1.5478745698928833, + "objective/rlhf_reward": -6.191498041152954, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.604130744934082, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.677734375, + "step": 1276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0142436027526855 + }, + { + "episode": 20448, + "epoch": 0.12251500880756372, + "loss/policy_avg": -0.15999212861061096, + "lr": 9.183921267893663e-06, + "objective/entropy": -265.859619140625, + "objective/kl": 39.34321975708008, + "objective/non_score_reward": -1.9671610593795776, + "objective/rlhf_reward": -7.8686442375183105, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.3606536388397217, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.705078125, + "step": 1277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989911317825317 + }, + { + "episode": 20464, + "epoch": 0.12261087344669327, + "loss/policy_avg": 0.14324238896369934, + "lr": 9.183282208588958e-06, + "objective/entropy": -173.67050170898438, + "objective/kl": 29.822010040283203, + "objective/non_score_reward": -1.491100549697876, + "objective/rlhf_reward": -5.964402079582214, + "objective/scores": 0.0, + "policy/approxkl_avg": 19.667259216308594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.744140625, + "step": 1278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975669384002686 + }, + { + "episode": 20480, + "epoch": 0.12270673808582282, + "loss/policy_avg": -0.20984116196632385, + "lr": 9.182643149284255e-06, + "objective/entropy": -196.99244689941406, + "objective/kl": 26.221698760986328, + "objective/non_score_reward": -1.3110849857330322, + "objective/rlhf_reward": -5.244340181350708, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.521638870239258, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5625, + "step": 1279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9990384578704834 + }, + { + "episode": 20496, + "epoch": 0.12280260272495237, + "loss/policy_avg": -0.25677821040153503, + "lr": 9.18200408997955e-06, + "objective/entropy": -182.93035888671875, + "objective/kl": 34.517967224121094, + "objective/non_score_reward": -1.7258983850479126, + "objective/rlhf_reward": -8.903593063354492, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.4066312313079834, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.701171875, + "step": 1280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0007028579711914 + }, + { + "episode": 20512, + "epoch": 0.12289846736408191, + "loss/policy_avg": 0.4508504867553711, + "lr": 9.181365030674847e-06, + "objective/entropy": -49.91909408569336, + "objective/kl": 45.33077621459961, + "objective/non_score_reward": -2.2665388584136963, + "objective/rlhf_reward": -11.066155433654785, + "objective/scores": -0.5, + "policy/approxkl_avg": 15.444662094116211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.744140625, + "step": 1281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9966447353363037 + }, + { + "episode": 20528, + "epoch": 0.12299433200321147, + "loss/policy_avg": 0.8163506984710693, + "lr": 9.180725971370144e-06, + "objective/entropy": -287.3531494140625, + "objective/kl": 44.37574768066406, + "objective/non_score_reward": -2.218787431716919, + "objective/rlhf_reward": -8.875149488449097, + "objective/scores": 0.0, + "policy/approxkl_avg": 14.398701667785645, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640625, + "step": 1282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.999485731124878 + }, + { + "episode": 20544, + "epoch": 0.12309019664234101, + "loss/policy_avg": 1.833961844444275, + "lr": 9.18008691206544e-06, + "objective/entropy": -88.22166442871094, + "objective/kl": 52.44647979736328, + "objective/non_score_reward": -2.622323989868164, + "objective/rlhf_reward": -9.13004633161871, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 22.343944549560547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 1283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9950611591339111 + }, + { + "episode": 20560, + "epoch": 0.12318606128147057, + "loss/policy_avg": 0.2040541172027588, + "lr": 9.179447852760737e-06, + "objective/entropy": -258.391357421875, + "objective/kl": 43.110595703125, + "objective/non_score_reward": -2.155529499053955, + "objective/rlhf_reward": -10.62211799621582, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.4909324645996094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62890625, + "step": 1284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0005974769592285 + }, + { + "episode": 20576, + "epoch": 0.12328192592060011, + "loss/policy_avg": 0.07932749390602112, + "lr": 9.178808793456033e-06, + "objective/entropy": -225.14065551757812, + "objective/kl": 34.00872802734375, + "objective/non_score_reward": -1.7004364728927612, + "objective/rlhf_reward": -6.801745653152466, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.444969415664673, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6484375, + "step": 1285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992835521697998 + }, + { + "episode": 20592, + "epoch": 0.12337779055972967, + "loss/policy_avg": 0.22031256556510925, + "lr": 9.17816973415133e-06, + "objective/entropy": -112.36615753173828, + "objective/kl": 43.09422302246094, + "objective/non_score_reward": -2.1547110080718994, + "objective/rlhf_reward": -7.308163275918364, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 82.84308624267578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59375, + "step": 1286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9967961311340332 + }, + { + "episode": 20608, + "epoch": 0.12347365519885921, + "loss/policy_avg": 0.3500659167766571, + "lr": 9.177530674846626e-06, + "objective/entropy": -199.16552734375, + "objective/kl": 37.99430847167969, + "objective/non_score_reward": -1.899715542793274, + "objective/rlhf_reward": -6.1482641502336115, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 7.39799690246582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.701171875, + "step": 1287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9984194040298462 + }, + { + "episode": 20624, + "epoch": 0.12356951983798876, + "loss/policy_avg": -0.18040573596954346, + "lr": 9.176891615541923e-06, + "objective/entropy": -243.39022827148438, + "objective/kl": 39.545589447021484, + "objective/non_score_reward": -1.9772794246673584, + "objective/rlhf_reward": -7.909117579460144, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.830007553100586, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.82421875, + "step": 1288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.003035068511963 + }, + { + "episode": 20640, + "epoch": 0.1236653844771183, + "loss/policy_avg": 0.046221084892749786, + "lr": 9.17625255623722e-06, + "objective/entropy": -274.96441650390625, + "objective/kl": 32.39388656616211, + "objective/non_score_reward": -1.6196943521499634, + "objective/rlhf_reward": -5.078777408599853, + "objective/scores": 0.35, + "policy/approxkl_avg": 11.188572883605957, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5234375, + "step": 1289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989345073699951 + }, + { + "episode": 20656, + "epoch": 0.12376124911624786, + "loss/policy_avg": 0.16251739859580994, + "lr": 9.175613496932517e-06, + "objective/entropy": -224.05252075195312, + "objective/kl": 41.598114013671875, + "objective/non_score_reward": -2.0799057483673096, + "objective/rlhf_reward": -8.31962251663208, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.0546166896820068, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6171875, + "step": 1290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999380111694336 + }, + { + "episode": 20672, + "epoch": 0.1238571137553774, + "loss/policy_avg": 0.003011360764503479, + "lr": 9.174974437627812e-06, + "objective/entropy": -96.3555908203125, + "objective/kl": 30.438373565673828, + "objective/non_score_reward": -1.521918773651123, + "objective/rlhf_reward": -4.531415670123652, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.271953582763672, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.634765625, + "step": 1291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0003316402435303 + }, + { + "episode": 20688, + "epoch": 0.12395297839450696, + "loss/policy_avg": 0.07926920056343079, + "lr": 9.174335378323109e-06, + "objective/entropy": -200.49888610839844, + "objective/kl": 43.77037048339844, + "objective/non_score_reward": -2.188518524169922, + "objective/rlhf_reward": -10.754074096679688, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.81719970703125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.544921875, + "step": 1292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998753547668457 + }, + { + "episode": 20704, + "epoch": 0.1240488430336365, + "loss/policy_avg": 0.13393539190292358, + "lr": 9.173696319018406e-06, + "objective/entropy": -135.31887817382812, + "objective/kl": 34.098899841308594, + "objective/non_score_reward": -1.7049450874328613, + "objective/rlhf_reward": -2.4197802305221554, + "objective/scores": 1.1, + "policy/approxkl_avg": 32.906028747558594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.490234375, + "step": 1293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.996498942375183 + }, + { + "episode": 20720, + "epoch": 0.12414470767276606, + "loss/policy_avg": 0.5098394155502319, + "lr": 9.173057259713703e-06, + "objective/entropy": -260.03424072265625, + "objective/kl": 48.678131103515625, + "objective/non_score_reward": -2.4339070320129395, + "objective/rlhf_reward": -11.735628128051758, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.966676712036133, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.748046875, + "step": 1294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9996039867401123 + }, + { + "episode": 20736, + "epoch": 0.1242405723118956, + "loss/policy_avg": 0.12228544801473618, + "lr": 9.172418200409e-06, + "objective/entropy": -199.34173583984375, + "objective/kl": 34.90636444091797, + "objective/non_score_reward": -1.7453181743621826, + "objective/rlhf_reward": -5.5574405982094675, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 5.361681938171387, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.759765625, + "step": 1295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9989519119262695 + }, + { + "episode": 20752, + "epoch": 0.12433643695102516, + "loss/policy_avg": 0.40392011404037476, + "lr": 9.171779141104295e-06, + "objective/entropy": -228.970458984375, + "objective/kl": 40.59248352050781, + "objective/non_score_reward": -2.0296242237091064, + "objective/rlhf_reward": -10.118497848510742, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.923158645629883, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.583984375, + "step": 1296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987823963165283 + }, + { + "episode": 20768, + "epoch": 0.1244323015901547, + "loss/policy_avg": 0.3041171431541443, + "lr": 9.171140081799592e-06, + "objective/entropy": -254.57518005371094, + "objective/kl": 33.09609603881836, + "objective/non_score_reward": -1.6548048257827759, + "objective/rlhf_reward": -6.6192193031311035, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8256323337554932, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.748046875, + "step": 1297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994516372680664 + }, + { + "episode": 20784, + "epoch": 0.12452816622928425, + "loss/policy_avg": 0.6333913207054138, + "lr": 9.170501022494889e-06, + "objective/entropy": -37.06604766845703, + "objective/kl": 40.758399963378906, + "objective/non_score_reward": -2.0379199981689453, + "objective/rlhf_reward": -8.151679992675781, + "objective/scores": 0.0, + "policy/approxkl_avg": 16.388423919677734, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.736328125, + "step": 1298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9970452785491943 + }, + { + "episode": 20800, + "epoch": 0.1246240308684138, + "loss/policy_avg": 0.6460137963294983, + "lr": 9.169861963190185e-06, + "objective/entropy": -136.16278076171875, + "objective/kl": 38.85075378417969, + "objective/non_score_reward": -1.9425376653671265, + "objective/rlhf_reward": -7.770150780677795, + "objective/scores": 0.0, + "policy/approxkl_avg": 55.14060592651367, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5625, + "step": 1299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9994523525238037 + }, + { + "episode": 20816, + "epoch": 0.12471989550754335, + "loss/policy_avg": 0.5988568067550659, + "lr": 9.169222903885482e-06, + "objective/entropy": -259.0589599609375, + "objective/kl": 42.378814697265625, + "objective/non_score_reward": -2.118940591812134, + "objective/rlhf_reward": -10.475763320922852, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.4832820892333984, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.69140625, + "step": 1300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995747804641724 + }, + { + "episode": 20832, + "epoch": 0.1248157601466729, + "loss/policy_avg": 0.835451602935791, + "lr": 9.168583844580777e-06, + "objective/entropy": -121.274658203125, + "objective/kl": 41.757080078125, + "objective/non_score_reward": -2.0878539085388184, + "objective/rlhf_reward": -8.351415753364563, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.3565473556518555, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65625, + "step": 1301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0015199184417725 + }, + { + "episode": 20848, + "epoch": 0.12491162478580245, + "loss/policy_avg": 0.07658547163009644, + "lr": 9.167944785276074e-06, + "objective/entropy": -217.8619842529297, + "objective/kl": 53.58893966674805, + "objective/non_score_reward": -2.6794471740722656, + "objective/rlhf_reward": -9.29395635863122, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 1.5108273029327393, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66796875, + "step": 1302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.998281717300415 + }, + { + "episode": 20864, + "epoch": 0.125007489424932, + "loss/policy_avg": 0.5005425810813904, + "lr": 9.167305725971371e-06, + "objective/entropy": -305.85546875, + "objective/kl": 35.98139953613281, + "objective/non_score_reward": -1.799070119857788, + "objective/rlhf_reward": -5.073573889509712, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 15.541563034057617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.64453125, + "step": 1303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.998666763305664 + }, + { + "episode": 20880, + "epoch": 0.12510335406406153, + "loss/policy_avg": 1.3687680959701538, + "lr": 9.166666666666666e-06, + "objective/entropy": -244.98106384277344, + "objective/kl": 26.2752685546875, + "objective/non_score_reward": -1.3137634992599487, + "objective/rlhf_reward": -5.255053877830505, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.946475982666016, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.630859375, + "step": 1304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 26, + "val/ratio": 1.9993999004364014 + }, + { + "episode": 20896, + "epoch": 0.1251992187031911, + "loss/policy_avg": 0.09463109076023102, + "lr": 9.166027607361963e-06, + "objective/entropy": -252.9051055908203, + "objective/kl": 32.28279113769531, + "objective/non_score_reward": -1.6141395568847656, + "objective/rlhf_reward": -8.456558227539062, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.355483055114746, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.638671875, + "step": 1305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9991474151611328 + }, + { + "episode": 20912, + "epoch": 0.12529508334232065, + "loss/policy_avg": 0.12172050029039383, + "lr": 9.16538854805726e-06, + "objective/entropy": -204.6068115234375, + "objective/kl": 35.159080505371094, + "objective/non_score_reward": -1.7579538822174072, + "objective/rlhf_reward": -2.6318155288696286, + "objective/scores": 1.1, + "policy/approxkl_avg": 0.8247687816619873, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.646484375, + "step": 1306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000840902328491 + }, + { + "episode": 20928, + "epoch": 0.1253909479814502, + "loss/policy_avg": 0.2253878116607666, + "lr": 9.164749488752557e-06, + "objective/entropy": -186.3021697998047, + "objective/kl": 43.261077880859375, + "objective/non_score_reward": -2.1630539894104004, + "objective/rlhf_reward": -8.652215480804443, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6487679481506348, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 1307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998298168182373 + }, + { + "episode": 20944, + "epoch": 0.12548681262057973, + "loss/policy_avg": 2.0130233764648438, + "lr": 9.164110429447854e-06, + "objective/entropy": -222.02183532714844, + "objective/kl": 38.97808837890625, + "objective/non_score_reward": -1.9489041566848755, + "objective/rlhf_reward": -5.395616686344146, + "objective/scores": 0.6, + "policy/approxkl_avg": 4.978163242340088, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71875, + "step": 1308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983195066452026 + }, + { + "episode": 20960, + "epoch": 0.12558267725970929, + "loss/policy_avg": 0.17725849151611328, + "lr": 9.163471370143149e-06, + "objective/entropy": -158.4937286376953, + "objective/kl": 44.14594268798828, + "objective/non_score_reward": -2.2072973251342773, + "objective/rlhf_reward": -8.82918930053711, + "objective/scores": 0.0, + "policy/approxkl_avg": 33.64825439453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.60546875, + "step": 1309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9982397556304932 + }, + { + "episode": 20976, + "epoch": 0.12567854189883884, + "loss/policy_avg": 0.32376429438591003, + "lr": 9.162832310838446e-06, + "objective/entropy": -204.19961547851562, + "objective/kl": 45.31929016113281, + "objective/non_score_reward": -2.2659645080566406, + "objective/rlhf_reward": -7.663857316970825, + "objective/scores": 0.35, + "policy/approxkl_avg": 9.415895462036133, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5625, + "step": 1310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999131202697754 + }, + { + "episode": 20992, + "epoch": 0.1257744065379684, + "loss/policy_avg": 0.12458442151546478, + "lr": 9.162193251533743e-06, + "objective/entropy": -265.5741271972656, + "objective/kl": 35.61662292480469, + "objective/non_score_reward": -1.780831217765808, + "objective/rlhf_reward": -9.12332534790039, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.166614532470703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.654296875, + "step": 1311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0003037452697754 + }, + { + "episode": 21008, + "epoch": 0.12587027117709793, + "loss/policy_avg": 0.2639521658420563, + "lr": 9.16155419222904e-06, + "objective/entropy": -239.51901245117188, + "objective/kl": 39.99544906616211, + "objective/non_score_reward": -1.9997724294662476, + "objective/rlhf_reward": -7.999090075492859, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.698028564453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.810546875, + "step": 1312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.997701644897461 + }, + { + "episode": 21024, + "epoch": 0.12596613581622748, + "loss/policy_avg": 0.3248225450515747, + "lr": 9.160915132924337e-06, + "objective/entropy": -133.79931640625, + "objective/kl": 48.588706970214844, + "objective/non_score_reward": -2.4294352531433105, + "objective/rlhf_reward": -11.717741012573242, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.692508697509766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 1313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999791145324707 + }, + { + "episode": 21040, + "epoch": 0.12606200045535704, + "loss/policy_avg": 0.1259753704071045, + "lr": 9.160276073619634e-06, + "objective/entropy": -214.23797607421875, + "objective/kl": 33.7969970703125, + "objective/non_score_reward": -1.689850091934204, + "objective/rlhf_reward": -6.759400129318237, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2487107515335083, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.740234375, + "step": 1314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0073482990264893 + }, + { + "episode": 21056, + "epoch": 0.1261578650944866, + "loss/policy_avg": 0.15544453263282776, + "lr": 9.159637014314929e-06, + "objective/entropy": -236.80731201171875, + "objective/kl": 40.89076614379883, + "objective/non_score_reward": -2.0445382595062256, + "objective/rlhf_reward": -8.17815351486206, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.472908854484558, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.634765625, + "step": 1315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0021939277648926 + }, + { + "episode": 21072, + "epoch": 0.12625372973361612, + "loss/policy_avg": 0.2730838656425476, + "lr": 9.158997955010226e-06, + "objective/entropy": -245.7431640625, + "objective/kl": 36.503013610839844, + "objective/non_score_reward": -1.825150728225708, + "objective/rlhf_reward": -5.941352927421017, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 4.245136260986328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.529296875, + "step": 1316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000441789627075 + }, + { + "episode": 21088, + "epoch": 0.12634959437274568, + "loss/policy_avg": 0.8404061198234558, + "lr": 9.158358895705522e-06, + "objective/entropy": -110.63333892822266, + "objective/kl": 38.572914123535156, + "objective/non_score_reward": -1.9286456108093262, + "objective/rlhf_reward": -6.4039015676587825, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 6.045080661773682, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.615234375, + "step": 1317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0006213188171387 + }, + { + "episode": 21104, + "epoch": 0.12644545901187523, + "loss/policy_avg": 0.1824544370174408, + "lr": 9.15771983640082e-06, + "objective/entropy": -256.66400146484375, + "objective/kl": 44.36193084716797, + "objective/non_score_reward": -2.2180964946746826, + "objective/rlhf_reward": -8.87238597869873, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7598674297332764, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.69140625, + "step": 1318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999514102935791 + }, + { + "episode": 21120, + "epoch": 0.1265413236510048, + "loss/policy_avg": -0.17376762628555298, + "lr": 9.157080777096116e-06, + "objective/entropy": -233.87896728515625, + "objective/kl": 41.74799346923828, + "objective/non_score_reward": -2.08739972114563, + "objective/rlhf_reward": -10.349599838256836, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.0124239921569824, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.638671875, + "step": 1319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988930225372314 + }, + { + "episode": 21136, + "epoch": 0.12663718829013432, + "loss/policy_avg": 0.24892321228981018, + "lr": 9.156441717791411e-06, + "objective/entropy": -223.25335693359375, + "objective/kl": 41.1420783996582, + "objective/non_score_reward": -2.0571041107177734, + "objective/rlhf_reward": -10.228416442871094, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.74615478515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.689453125, + "step": 1320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9998877048492432 + }, + { + "episode": 21152, + "epoch": 0.12673305292926387, + "loss/policy_avg": -0.04218818619847298, + "lr": 9.155802658486708e-06, + "objective/entropy": -73.82662963867188, + "objective/kl": 41.3078727722168, + "objective/non_score_reward": -2.0653934478759766, + "objective/rlhf_reward": -6.436745639118264, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 3.6249895095825195, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.708984375, + "step": 1321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982481002807617 + }, + { + "episode": 21168, + "epoch": 0.12682891756839343, + "loss/policy_avg": -0.10386607050895691, + "lr": 9.155163599182005e-06, + "objective/entropy": -251.12631225585938, + "objective/kl": 44.060638427734375, + "objective/non_score_reward": -2.2030317783355713, + "objective/rlhf_reward": -8.812127232551575, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.3046770095825195, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.673828125, + "step": 1322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9990068674087524 + }, + { + "episode": 21184, + "epoch": 0.12692478220752298, + "loss/policy_avg": 1.2251129150390625, + "lr": 9.1545245398773e-06, + "objective/entropy": -217.22891235351562, + "objective/kl": 45.271419525146484, + "objective/non_score_reward": -2.263571262359619, + "objective/rlhf_reward": -9.05428433418274, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8608626127243042, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.705078125, + "step": 1323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0005476474761963 + }, + { + "episode": 21200, + "epoch": 0.1270206468466525, + "loss/policy_avg": 0.7113258242607117, + "lr": 9.153885480572597e-06, + "objective/entropy": -222.394287109375, + "objective/kl": 38.936744689941406, + "objective/non_score_reward": -1.9468371868133545, + "objective/rlhf_reward": -9.787348747253418, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.650562286376953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.67578125, + "step": 1324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9962797164916992 + }, + { + "episode": 21216, + "epoch": 0.12711651148578207, + "loss/policy_avg": 0.23568665981292725, + "lr": 9.153246421267894e-06, + "objective/entropy": -195.58255004882812, + "objective/kl": 35.354164123535156, + "objective/non_score_reward": -1.7677080631256104, + "objective/rlhf_reward": -7.0708324909210205, + "objective/scores": 0.0, + "policy/approxkl_avg": 36.592811584472656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.755859375, + "step": 1325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998015284538269 + }, + { + "episode": 21232, + "epoch": 0.12721237612491162, + "loss/policy_avg": 1.0861258506774902, + "lr": 9.152607361963191e-06, + "objective/entropy": -159.83738708496094, + "objective/kl": 55.35202407836914, + "objective/non_score_reward": -2.767601490020752, + "objective/rlhf_reward": -9.670405244827272, + "objective/scores": 0.35, + "policy/approxkl_avg": 11.508237838745117, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.75, + "step": 1326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.002288341522217 + }, + { + "episode": 21248, + "epoch": 0.12730824076404118, + "loss/policy_avg": 0.3134673833847046, + "lr": 9.151968302658488e-06, + "objective/entropy": -248.45355224609375, + "objective/kl": 36.30670928955078, + "objective/non_score_reward": -1.8153355121612549, + "objective/rlhf_reward": -7.261342406272888, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.159135341644287, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.703125, + "step": 1327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998664140701294 + }, + { + "episode": 21264, + "epoch": 0.12740410540317074, + "loss/policy_avg": 0.3148130178451538, + "lr": 9.151329243353783e-06, + "objective/entropy": -28.954387664794922, + "objective/kl": 28.350412368774414, + "objective/non_score_reward": -1.4175206422805786, + "objective/rlhf_reward": -1.2700828075408932, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.8836774826049805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.791015625, + "step": 1328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973502159118652 + }, + { + "episode": 21280, + "epoch": 0.12749997004230026, + "loss/policy_avg": 0.42025741934776306, + "lr": 9.15069018404908e-06, + "objective/entropy": -104.76248168945312, + "objective/kl": 37.52256774902344, + "objective/non_score_reward": -1.8761284351348877, + "objective/rlhf_reward": -9.504514694213867, + "objective/scores": -0.5, + "policy/approxkl_avg": 104.57810974121094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.806640625, + "step": 1329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0003468990325928 + }, + { + "episode": 21296, + "epoch": 0.12759583468142982, + "loss/policy_avg": 0.1600969433784485, + "lr": 9.150051124744377e-06, + "objective/entropy": -188.2863311767578, + "objective/kl": 31.741531372070312, + "objective/non_score_reward": -1.5870766639709473, + "objective/rlhf_reward": -8.348306655883789, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.479442596435547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.693359375, + "step": 1330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.996354103088379 + }, + { + "episode": 21312, + "epoch": 0.12769169932055938, + "loss/policy_avg": -0.08294879645109177, + "lr": 9.149412065439674e-06, + "objective/entropy": -220.495361328125, + "objective/kl": 35.95433807373047, + "objective/non_score_reward": -1.7977170944213867, + "objective/rlhf_reward": -9.190868377685547, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.635646343231201, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.666015625, + "step": 1331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0019330978393555 + }, + { + "episode": 21328, + "epoch": 0.12778756395968893, + "loss/policy_avg": 0.10603788495063782, + "lr": 9.14877300613497e-06, + "objective/entropy": -48.090450286865234, + "objective/kl": 26.891372680664062, + "objective/non_score_reward": -1.3445687294006348, + "objective/rlhf_reward": -7.378274917602539, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.43210220336914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 1332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0011115074157715 + }, + { + "episode": 21344, + "epoch": 0.12788342859881846, + "loss/policy_avg": 0.30039137601852417, + "lr": 9.148133946830266e-06, + "objective/entropy": -199.95272827148438, + "objective/kl": 32.25083541870117, + "objective/non_score_reward": -1.612541913986206, + "objective/rlhf_reward": -2.050167655944824, + "objective/scores": 1.1, + "policy/approxkl_avg": 40.17951202392578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71875, + "step": 1333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9978996515274048 + }, + { + "episode": 21360, + "epoch": 0.12797929323794802, + "loss/policy_avg": 0.12805229425430298, + "lr": 9.147494887525563e-06, + "objective/entropy": -210.07606506347656, + "objective/kl": 27.649646759033203, + "objective/non_score_reward": -1.3824822902679443, + "objective/rlhf_reward": -5.5299293994903564, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.75002384185791, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69921875, + "step": 1334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9982640743255615 + }, + { + "episode": 21376, + "epoch": 0.12807515787707757, + "loss/policy_avg": -0.07147381454706192, + "lr": 9.14685582822086e-06, + "objective/entropy": -209.8764190673828, + "objective/kl": 32.452796936035156, + "objective/non_score_reward": -1.6226398944854736, + "objective/rlhf_reward": -5.165046367674989, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 0.3909912109375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.66796875, + "step": 1335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.002310276031494 + }, + { + "episode": 21392, + "epoch": 0.12817102251620713, + "loss/policy_avg": 0.10854579508304596, + "lr": 9.146216768916156e-06, + "objective/entropy": -249.07809448242188, + "objective/kl": 40.571285247802734, + "objective/non_score_reward": -2.028564214706421, + "objective/rlhf_reward": -6.510136995379048, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 7.510847091674805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736328125, + "step": 1336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9966883659362793 + }, + { + "episode": 21408, + "epoch": 0.12826688715533666, + "loss/policy_avg": 0.2952539026737213, + "lr": 9.145577709611453e-06, + "objective/entropy": -210.13613891601562, + "objective/kl": 38.8017578125, + "objective/non_score_reward": -1.9400880336761475, + "objective/rlhf_reward": -7.760351896286011, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.842449188232422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.720703125, + "step": 1337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9967563152313232 + }, + { + "episode": 21424, + "epoch": 0.1283627517944662, + "loss/policy_avg": -0.016138926148414612, + "lr": 9.14493865030675e-06, + "objective/entropy": -200.32778930664062, + "objective/kl": 27.706912994384766, + "objective/non_score_reward": -1.385345697402954, + "objective/rlhf_reward": -4.117550869186489, + "objective/scores": 0.35595802480981553, + "policy/approxkl_avg": 2.4525434970855713, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59375, + "step": 1338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994597434997559 + }, + { + "episode": 21440, + "epoch": 0.12845861643359577, + "loss/policy_avg": 0.1709384322166443, + "lr": 9.144299591002045e-06, + "objective/entropy": -213.44891357421875, + "objective/kl": 35.66492462158203, + "objective/non_score_reward": -1.7832460403442383, + "objective/rlhf_reward": -7.132984519004822, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.60143756866455, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73828125, + "step": 1339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990102052688599 + }, + { + "episode": 21456, + "epoch": 0.12855448107272532, + "loss/policy_avg": 0.4829360842704773, + "lr": 9.143660531697342e-06, + "objective/entropy": -210.03842163085938, + "objective/kl": 33.9603271484375, + "objective/non_score_reward": -1.6980164051055908, + "objective/rlhf_reward": -3.8683466061365337, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.802133560180664, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.875, + "step": 1340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9986188411712646 + }, + { + "episode": 21472, + "epoch": 0.12865034571185485, + "loss/policy_avg": 0.01272507756948471, + "lr": 9.143021472392639e-06, + "objective/entropy": -251.24383544921875, + "objective/kl": 44.83313751220703, + "objective/non_score_reward": -2.24165678024292, + "objective/rlhf_reward": -8.96662712097168, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.942490816116333, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75, + "step": 1341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001084804534912 + }, + { + "episode": 21488, + "epoch": 0.1287462103509844, + "loss/policy_avg": 0.14268061518669128, + "lr": 9.142382413087936e-06, + "objective/entropy": -294.4553527832031, + "objective/kl": 30.44989013671875, + "objective/non_score_reward": -1.5224944353103638, + "objective/rlhf_reward": -6.089977741241455, + "objective/scores": 0.0, + "policy/approxkl_avg": 11.286410331726074, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6484375, + "step": 1342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.999746322631836 + }, + { + "episode": 21504, + "epoch": 0.12884207499011396, + "loss/policy_avg": 0.23312777280807495, + "lr": 9.141743353783233e-06, + "objective/entropy": -208.1106414794922, + "objective/kl": 29.394561767578125, + "objective/non_score_reward": -1.4697279930114746, + "objective/rlhf_reward": -3.478912210464477, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.9597384929656982, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.787109375, + "step": 1343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9970589876174927 + }, + { + "episode": 21520, + "epoch": 0.12893793962924352, + "loss/policy_avg": 0.3072141408920288, + "lr": 9.14110429447853e-06, + "objective/entropy": -150.29653930664062, + "objective/kl": 34.47722625732422, + "objective/non_score_reward": -1.7238614559173584, + "objective/rlhf_reward": -4.495445704460144, + "objective/scores": 0.6, + "policy/approxkl_avg": 9.984395980834961, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.697265625, + "step": 1344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9992399215698242 + }, + { + "episode": 21536, + "epoch": 0.12903380426837305, + "loss/policy_avg": -0.01569405198097229, + "lr": 9.140465235173825e-06, + "objective/entropy": -204.17913818359375, + "objective/kl": 34.517784118652344, + "objective/non_score_reward": -1.7258893251419067, + "objective/rlhf_reward": -8.903556823730469, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.9644145965576172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 1345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0025582313537598 + }, + { + "episode": 21552, + "epoch": 0.1291296689075026, + "loss/policy_avg": 0.6427126526832581, + "lr": 9.13982617586912e-06, + "objective/entropy": -78.82882690429688, + "objective/kl": 43.22979736328125, + "objective/non_score_reward": -2.161489963531494, + "objective/rlhf_reward": -6.912626401583353, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 3.5858652591705322, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.568359375, + "step": 1346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9986443519592285 + }, + { + "episode": 21568, + "epoch": 0.12922553354663216, + "loss/policy_avg": 0.07235066592693329, + "lr": 9.139187116564417e-06, + "objective/entropy": -196.06838989257812, + "objective/kl": 28.267013549804688, + "objective/non_score_reward": -1.4133508205413818, + "objective/rlhf_reward": -5.653403162956238, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.8308835029602051, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.59765625, + "step": 1347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001713514328003 + }, + { + "episode": 21584, + "epoch": 0.12932139818576172, + "loss/policy_avg": 0.012343340553343296, + "lr": 9.138548057259714e-06, + "objective/entropy": -182.97830200195312, + "objective/kl": 29.497230529785156, + "objective/non_score_reward": -1.4748615026474, + "objective/rlhf_reward": -7.8994460105896, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.9755702018737793, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65625, + "step": 1348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9990975856781006 + }, + { + "episode": 21600, + "epoch": 0.12941726282489124, + "loss/policy_avg": 0.1750022917985916, + "lr": 9.13790899795501e-06, + "objective/entropy": -231.31161499023438, + "objective/kl": 35.02555465698242, + "objective/non_score_reward": -1.7512778043746948, + "objective/rlhf_reward": -7.005111217498779, + "objective/scores": 0.0, + "policy/approxkl_avg": 61.47712707519531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.755859375, + "step": 1349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.998092770576477 + }, + { + "episode": 21616, + "epoch": 0.1295131274640208, + "loss/policy_avg": 0.37204813957214355, + "lr": 9.137269938650308e-06, + "objective/entropy": -223.54615783691406, + "objective/kl": 34.17963409423828, + "objective/non_score_reward": -1.7089817523956299, + "objective/rlhf_reward": -6.83592677116394, + "objective/scores": 0.0, + "policy/approxkl_avg": 40.581817626953125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.65625, + "step": 1350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0005273818969727 + }, + { + "episode": 21632, + "epoch": 0.12960899210315036, + "loss/policy_avg": 0.009270786307752132, + "lr": 9.136630879345604e-06, + "objective/entropy": -217.45834350585938, + "objective/kl": 40.40803527832031, + "objective/non_score_reward": -2.020401954650879, + "objective/rlhf_reward": -6.681608414649963, + "objective/scores": 0.35, + "policy/approxkl_avg": 136.1207275390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 1351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985735416412354 + }, + { + "episode": 21648, + "epoch": 0.1297048567422799, + "loss/policy_avg": 0.42100289463996887, + "lr": 9.1359918200409e-06, + "objective/entropy": -185.81472778320312, + "objective/kl": 44.553558349609375, + "objective/non_score_reward": -2.227677822113037, + "objective/rlhf_reward": -4.510711407661438, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.366624355316162, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 1352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999260663986206 + }, + { + "episode": 21664, + "epoch": 0.12980072138140944, + "loss/policy_avg": 0.317403644323349, + "lr": 9.135352760736197e-06, + "objective/entropy": -129.45350646972656, + "objective/kl": 40.61649703979492, + "objective/non_score_reward": -2.030824661254883, + "objective/rlhf_reward": -8.12329888343811, + "objective/scores": 0.0, + "policy/approxkl_avg": 111.90044403076172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.703125, + "step": 1353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9951592683792114 + }, + { + "episode": 21680, + "epoch": 0.129896586020539, + "loss/policy_avg": 0.042517438530921936, + "lr": 9.134713701431493e-06, + "objective/entropy": -204.95616149902344, + "objective/kl": 33.41504669189453, + "objective/non_score_reward": -1.6707521677017212, + "objective/rlhf_reward": -4.283008790016174, + "objective/scores": 0.6, + "policy/approxkl_avg": 10.436746597290039, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.595703125, + "step": 1354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.022730827331543 + }, + { + "episode": 21696, + "epoch": 0.12999245065966855, + "loss/policy_avg": 0.10988529771566391, + "lr": 9.13407464212679e-06, + "objective/entropy": -235.26551818847656, + "objective/kl": 40.63349151611328, + "objective/non_score_reward": -2.03167462348938, + "objective/rlhf_reward": -8.126698017120361, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3937795162200928, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 1355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983268976211548 + }, + { + "episode": 21712, + "epoch": 0.1300883152987981, + "loss/policy_avg": 1.0233821868896484, + "lr": 9.133435582822087e-06, + "objective/entropy": -97.07902526855469, + "objective/kl": 38.8305549621582, + "objective/non_score_reward": -1.9415278434753418, + "objective/rlhf_reward": -9.766111373901367, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.853279113769531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.787109375, + "step": 1356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.999967336654663 + }, + { + "episode": 21728, + "epoch": 0.13018417993792764, + "loss/policy_avg": 1.16000235080719, + "lr": 9.132796523517384e-06, + "objective/entropy": -236.8131103515625, + "objective/kl": 35.032630920410156, + "objective/non_score_reward": -1.751631498336792, + "objective/rlhf_reward": -7.006525874137878, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.125271797180176, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.61328125, + "step": 1357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987199306488037 + }, + { + "episode": 21744, + "epoch": 0.1302800445770572, + "loss/policy_avg": -0.0014195814728736877, + "lr": 9.13215746421268e-06, + "objective/entropy": -225.53182983398438, + "objective/kl": 32.558837890625, + "objective/non_score_reward": -1.6279419660568237, + "objective/rlhf_reward": -6.511767864227295, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.403553009033203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002810001373291 + }, + { + "episode": 21760, + "epoch": 0.13037590921618675, + "loss/policy_avg": 0.3156861960887909, + "lr": 9.131518404907976e-06, + "objective/entropy": -84.26490020751953, + "objective/kl": 48.92396545410156, + "objective/non_score_reward": -2.446197986602783, + "objective/rlhf_reward": -7.38479266166687, + "objective/scores": 0.6, + "policy/approxkl_avg": 27.199058532714844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.59765625, + "step": 1359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987435340881348 + }, + { + "episode": 21776, + "epoch": 0.1304717738553163, + "loss/policy_avg": 0.09869790822267532, + "lr": 9.130879345603273e-06, + "objective/entropy": -180.8636474609375, + "objective/kl": 35.70021057128906, + "objective/non_score_reward": -1.785010576248169, + "objective/rlhf_reward": -7.140042304992676, + "objective/scores": 0.0, + "policy/approxkl_avg": 55.01975631713867, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.552734375, + "step": 1360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0002808570861816 + }, + { + "episode": 21792, + "epoch": 0.13056763849444583, + "loss/policy_avg": -0.008832663297653198, + "lr": 9.13024028629857e-06, + "objective/entropy": -186.73507690429688, + "objective/kl": 47.136600494384766, + "objective/non_score_reward": -2.35683012008667, + "objective/rlhf_reward": -9.427320122718811, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.234107494354248, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.654296875, + "step": 1361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000197410583496 + }, + { + "episode": 21808, + "epoch": 0.1306635031335754, + "loss/policy_avg": 0.06845638155937195, + "lr": 9.129601226993867e-06, + "objective/entropy": -170.82791137695312, + "objective/kl": 41.89483642578125, + "objective/non_score_reward": -2.0947418212890625, + "objective/rlhf_reward": -8.378967881202698, + "objective/scores": 0.0, + "policy/approxkl_avg": 33.29808807373047, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.71875, + "step": 1362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.997246265411377 + }, + { + "episode": 21824, + "epoch": 0.13075936777270494, + "loss/policy_avg": -0.3103432357311249, + "lr": 9.128962167689162e-06, + "objective/entropy": -188.14288330078125, + "objective/kl": 43.23204803466797, + "objective/non_score_reward": -2.16160249710083, + "objective/rlhf_reward": -10.64640998840332, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.4708389043807983, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7109375, + "step": 1363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997010231018066 + }, + { + "episode": 21840, + "epoch": 0.1308552324118345, + "loss/policy_avg": 0.04574498161673546, + "lr": 9.128323108384459e-06, + "objective/entropy": -225.43228149414062, + "objective/kl": 35.95550537109375, + "objective/non_score_reward": -1.7977752685546875, + "objective/rlhf_reward": -7.19110107421875, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.6216052174568176, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.70703125, + "step": 1364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0010831356048584 + }, + { + "episode": 21856, + "epoch": 0.13095109705096403, + "loss/policy_avg": -0.003664642572402954, + "lr": 9.127684049079756e-06, + "objective/entropy": -87.38859558105469, + "objective/kl": 40.37055969238281, + "objective/non_score_reward": -2.0185279846191406, + "objective/rlhf_reward": -6.469992432657795, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 6.654456615447998, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.615234375, + "step": 1365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0019521713256836 + }, + { + "episode": 21872, + "epoch": 0.13104696169009358, + "loss/policy_avg": 0.19343380630016327, + "lr": 9.127044989775053e-06, + "objective/entropy": -285.2890930175781, + "objective/kl": 30.914960861206055, + "objective/non_score_reward": -1.545747995376587, + "objective/rlhf_reward": -8.182991981506348, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.2243709564209, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.552734375, + "step": 1366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995231628417969 + }, + { + "episode": 21888, + "epoch": 0.13114282632922314, + "loss/policy_avg": -0.3367545008659363, + "lr": 9.126405930470348e-06, + "objective/entropy": -174.12828063964844, + "objective/kl": 41.601070404052734, + "objective/non_score_reward": -2.0800533294677734, + "objective/rlhf_reward": -8.320213675498962, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.687625408172607, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.720703125, + "step": 1367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0008833408355713 + }, + { + "episode": 21904, + "epoch": 0.1312386909683527, + "loss/policy_avg": -0.055573105812072754, + "lr": 9.125766871165645e-06, + "objective/entropy": -197.14544677734375, + "objective/kl": 34.075538635253906, + "objective/non_score_reward": -1.7037768363952637, + "objective/rlhf_reward": -4.867696235852177, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 7.123730659484863, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8203125, + "step": 1368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.002974033355713 + }, + { + "episode": 21920, + "epoch": 0.13133455560748222, + "loss/policy_avg": 1.4661061763763428, + "lr": 9.125127811860942e-06, + "objective/entropy": -233.4633026123047, + "objective/kl": 18.249441146850586, + "objective/non_score_reward": -0.9124720096588135, + "objective/rlhf_reward": -3.649888038635254, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.0828328132629395, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.755859375, + "step": 1369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001650094985962 + }, + { + "episode": 21936, + "epoch": 0.13143042024661178, + "loss/policy_avg": -0.08706043660640717, + "lr": 9.124488752556238e-06, + "objective/entropy": -228.32534790039062, + "objective/kl": 31.26093292236328, + "objective/non_score_reward": -1.5630466938018799, + "objective/rlhf_reward": -1.8521866559982296, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.4632827043533325, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 1370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.00002384185791 + }, + { + "episode": 21952, + "epoch": 0.13152628488574133, + "loss/policy_avg": 0.05965416133403778, + "lr": 9.123849693251534e-06, + "objective/entropy": -187.98910522460938, + "objective/kl": 26.322303771972656, + "objective/non_score_reward": -1.3161152601242065, + "objective/rlhf_reward": -0.8644610404968258, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.225238800048828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 1371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9994935989379883 + }, + { + "episode": 21968, + "epoch": 0.1316221495248709, + "loss/policy_avg": -0.11005878448486328, + "lr": 9.12321063394683e-06, + "objective/entropy": -158.16270446777344, + "objective/kl": 40.21120834350586, + "objective/non_score_reward": -2.0105605125427246, + "objective/rlhf_reward": -10.042242050170898, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.171478271484375, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.703125, + "step": 1372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0031611919403076 + }, + { + "episode": 21984, + "epoch": 0.13171801416400042, + "loss/policy_avg": 0.06528766453266144, + "lr": 9.122571574642127e-06, + "objective/entropy": -74.28417205810547, + "objective/kl": 43.48757553100586, + "objective/non_score_reward": -2.1743788719177246, + "objective/rlhf_reward": -10.697515487670898, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.4419310092926025, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.802734375, + "step": 1373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0019640922546387 + }, + { + "episode": 22000, + "epoch": 0.13181387880312997, + "loss/policy_avg": 0.44947099685668945, + "lr": 9.121932515337424e-06, + "objective/entropy": -207.0897216796875, + "objective/kl": 32.26081466674805, + "objective/non_score_reward": -1.6130409240722656, + "objective/rlhf_reward": -6.452163398265839, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.64161491394043, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.828125, + "step": 1374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9975173473358154 + }, + { + "episode": 22016, + "epoch": 0.13190974344225953, + "loss/policy_avg": 0.4614018201828003, + "lr": 9.121293456032721e-06, + "objective/entropy": -93.75225830078125, + "objective/kl": 37.148765563964844, + "objective/non_score_reward": -1.857438325881958, + "objective/rlhf_reward": -9.429753303527832, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.6527957916259766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.705078125, + "step": 1375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0026564598083496 + }, + { + "episode": 22032, + "epoch": 0.1320056080813891, + "loss/policy_avg": 0.1266087293624878, + "lr": 9.120654396728016e-06, + "objective/entropy": -237.70501708984375, + "objective/kl": 28.604475021362305, + "objective/non_score_reward": -1.4302237033843994, + "objective/rlhf_reward": -3.9875616590181986, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.0936261415481567, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.697265625, + "step": 1376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001415729522705 + }, + { + "episode": 22048, + "epoch": 0.13210147272051861, + "loss/policy_avg": 0.00786939263343811, + "lr": 9.120015337423313e-06, + "objective/entropy": -206.1710662841797, + "objective/kl": 49.45941162109375, + "objective/non_score_reward": -2.472970485687256, + "objective/rlhf_reward": -9.891881823539734, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.474703311920166, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.591796875, + "step": 1377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9989300966262817 + }, + { + "episode": 22064, + "epoch": 0.13219733735964817, + "loss/policy_avg": 0.14760851860046387, + "lr": 9.11937627811861e-06, + "objective/entropy": -264.64984130859375, + "objective/kl": 43.790306091308594, + "objective/non_score_reward": -2.1895155906677246, + "objective/rlhf_reward": -10.758062362670898, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.991140365600586, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 1378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9970654249191284 + }, + { + "episode": 22080, + "epoch": 0.13229320199877773, + "loss/policy_avg": -0.021399877965450287, + "lr": 9.118737218813907e-06, + "objective/entropy": -157.84242248535156, + "objective/kl": 49.67934799194336, + "objective/non_score_reward": -2.4839675426483154, + "objective/rlhf_reward": -9.935869693756104, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.979637622833252, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75, + "step": 1379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999756813049316 + }, + { + "episode": 22096, + "epoch": 0.13238906663790728, + "loss/policy_avg": 0.06770411133766174, + "lr": 9.118098159509204e-06, + "objective/entropy": -265.515869140625, + "objective/kl": 41.99637222290039, + "objective/non_score_reward": -2.099818706512451, + "objective/rlhf_reward": -10.399274826049805, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.6817909479141235, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.771484375, + "step": 1380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0022201538085938 + }, + { + "episode": 22112, + "epoch": 0.1324849312770368, + "loss/policy_avg": 0.3092069625854492, + "lr": 9.1174591002045e-06, + "objective/entropy": -231.37794494628906, + "objective/kl": 32.23219299316406, + "objective/non_score_reward": -1.611609697341919, + "objective/rlhf_reward": -4.842318568293171, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 14.457145690917969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7109375, + "step": 1381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9975634813308716 + }, + { + "episode": 22128, + "epoch": 0.13258079591616637, + "loss/policy_avg": 0.06873491406440735, + "lr": 9.116820040899796e-06, + "objective/entropy": -137.2312469482422, + "objective/kl": 45.7124137878418, + "objective/non_score_reward": -2.28562068939209, + "objective/rlhf_reward": -11.14248275756836, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.212198257446289, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 1382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980733394622803 + }, + { + "episode": 22144, + "epoch": 0.13267666055529592, + "loss/policy_avg": -0.08160250633955002, + "lr": 9.116180981595093e-06, + "objective/entropy": -216.74615478515625, + "objective/kl": 36.32727813720703, + "objective/non_score_reward": -1.816364049911499, + "objective/rlhf_reward": -9.265456199645996, + "objective/scores": -0.5, + "policy/approxkl_avg": 90.0169448852539, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8828125, + "step": 1383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0009403228759766 + }, + { + "episode": 22160, + "epoch": 0.13277252519442548, + "loss/policy_avg": -0.008119195699691772, + "lr": 9.11554192229039e-06, + "objective/entropy": -201.57730102539062, + "objective/kl": 30.65857696533203, + "objective/non_score_reward": -1.532928705215454, + "objective/rlhf_reward": -4.4698555521374805, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 23.816858291625977, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7578125, + "step": 1384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0028131008148193 + }, + { + "episode": 22176, + "epoch": 0.13286838983355503, + "loss/policy_avg": 0.6478606462478638, + "lr": 9.114902862985686e-06, + "objective/entropy": -143.7909698486328, + "objective/kl": 33.66261291503906, + "objective/non_score_reward": -1.6831307411193848, + "objective/rlhf_reward": -6.73252272605896, + "objective/scores": 0.0, + "policy/approxkl_avg": 18.739097595214844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.765625, + "step": 1385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0000014305114746 + }, + { + "episode": 22192, + "epoch": 0.13296425447268456, + "loss/policy_avg": 0.9258921146392822, + "lr": 9.114263803680983e-06, + "objective/entropy": -132.45114135742188, + "objective/kl": 39.32126998901367, + "objective/non_score_reward": -1.9660634994506836, + "objective/rlhf_reward": -3.464254236221313, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.645174980163574, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.572265625, + "step": 1386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.0007033348083496 + }, + { + "episode": 22208, + "epoch": 0.13306011911181412, + "loss/policy_avg": -0.05567781999707222, + "lr": 9.113624744376279e-06, + "objective/entropy": -248.31585693359375, + "objective/kl": 31.968358993530273, + "objective/non_score_reward": -1.5984179973602295, + "objective/rlhf_reward": -8.393671989440918, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.8747513294219971, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.69140625, + "step": 1387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000096321105957 + }, + { + "episode": 22224, + "epoch": 0.13315598375094367, + "loss/policy_avg": -0.03815114498138428, + "lr": 9.112985685071575e-06, + "objective/entropy": -192.3355712890625, + "objective/kl": 37.410308837890625, + "objective/non_score_reward": -1.8705153465270996, + "objective/rlhf_reward": -7.48206102848053, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3578522205352783, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8046875, + "step": 1388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9993131160736084 + }, + { + "episode": 22240, + "epoch": 0.13325184839007323, + "loss/policy_avg": 0.12146013230085373, + "lr": 9.112346625766872e-06, + "objective/entropy": -210.94613647460938, + "objective/kl": 36.31158447265625, + "objective/non_score_reward": -1.8155791759490967, + "objective/rlhf_reward": -9.262317657470703, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.93437385559082, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7734375, + "step": 1389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0063793659210205 + }, + { + "episode": 22256, + "epoch": 0.13334771302920276, + "loss/policy_avg": 0.07935690134763718, + "lr": 9.111707566462168e-06, + "objective/entropy": -78.06504821777344, + "objective/kl": 24.804061889648438, + "objective/non_score_reward": -1.2402031421661377, + "objective/rlhf_reward": -3.4045530845790655, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 35.813602447509766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.890625, + "step": 1390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986217021942139 + }, + { + "episode": 22272, + "epoch": 0.1334435776683323, + "loss/policy_avg": -0.11746858060359955, + "lr": 9.111068507157464e-06, + "objective/entropy": -205.5814208984375, + "objective/kl": 39.45323181152344, + "objective/non_score_reward": -1.9726617336273193, + "objective/rlhf_reward": -7.890646874904633, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6865644454956055, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.642578125, + "step": 1391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001758098602295 + }, + { + "episode": 22288, + "epoch": 0.13353944230746187, + "loss/policy_avg": 0.02275949716567993, + "lr": 9.110429447852761e-06, + "objective/entropy": -170.2174835205078, + "objective/kl": 47.165924072265625, + "objective/non_score_reward": -2.3582963943481445, + "objective/rlhf_reward": -7.952232244427561, + "objective/scores": 0.3702381544273198, + "policy/approxkl_avg": 9.971677780151367, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.787109375, + "step": 1392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981083869934082 + }, + { + "episode": 22304, + "epoch": 0.13363530694659143, + "loss/policy_avg": 0.6221246123313904, + "lr": 9.109790388548058e-06, + "objective/entropy": -209.15390014648438, + "objective/kl": 47.805728912353516, + "objective/non_score_reward": -2.390286445617676, + "objective/rlhf_reward": -8.182543852416378, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 11.362401962280273, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.671875, + "step": 1393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982948303222656 + }, + { + "episode": 22320, + "epoch": 0.13373117158572095, + "loss/policy_avg": 0.7497224807739258, + "lr": 9.109151329243355e-06, + "objective/entropy": -214.89273071289062, + "objective/kl": 40.493778228759766, + "objective/non_score_reward": -2.024688959121704, + "objective/rlhf_reward": -8.098755717277527, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1780507564544678, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.791015625, + "step": 1394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.000169515609741 + }, + { + "episode": 22336, + "epoch": 0.1338270362248505, + "loss/policy_avg": 0.37489771842956543, + "lr": 9.10851226993865e-06, + "objective/entropy": -183.09951782226562, + "objective/kl": 37.789222717285156, + "objective/non_score_reward": -1.8894612789154053, + "objective/rlhf_reward": -6.001585571971491, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.5605037212371826, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 1395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987205266952515 + }, + { + "episode": 22352, + "epoch": 0.13392290086398007, + "loss/policy_avg": 0.4532851576805115, + "lr": 9.107873210633947e-06, + "objective/entropy": -168.13888549804688, + "objective/kl": 33.63282012939453, + "objective/non_score_reward": -1.6816411018371582, + "objective/rlhf_reward": -4.9932313124338785, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 3.2246198654174805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.900390625, + "step": 1396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991194009780884 + }, + { + "episode": 22368, + "epoch": 0.13401876550310962, + "loss/policy_avg": -0.1980736255645752, + "lr": 9.107234151329244e-06, + "objective/entropy": -191.61257934570312, + "objective/kl": 37.054786682128906, + "objective/non_score_reward": -1.8527392148971558, + "objective/rlhf_reward": -5.74909711402713, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 70.29098510742188, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.482421875, + "step": 1397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001042127609253 + }, + { + "episode": 22384, + "epoch": 0.13411463014223915, + "loss/policy_avg": 0.04644312709569931, + "lr": 9.10659509202454e-06, + "objective/entropy": -205.40386962890625, + "objective/kl": 27.17392349243164, + "objective/non_score_reward": -1.3586962223052979, + "objective/rlhf_reward": -7.434784412384033, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.466697692871094, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.802734375, + "step": 1398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0002012252807617 + }, + { + "episode": 22400, + "epoch": 0.1342104947813687, + "loss/policy_avg": 0.7228728532791138, + "lr": 9.105956032719838e-06, + "objective/entropy": -178.23971557617188, + "objective/kl": 32.924461364746094, + "objective/non_score_reward": -1.6462230682373047, + "objective/rlhf_reward": -8.584892272949219, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.7802796363830566, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.634765625, + "step": 1399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998124361038208 + }, + { + "episode": 22416, + "epoch": 0.13430635942049826, + "loss/policy_avg": 0.6789200901985168, + "lr": 9.105316973415133e-06, + "objective/entropy": -238.48046875, + "objective/kl": 26.855918884277344, + "objective/non_score_reward": -1.3427958488464355, + "objective/rlhf_reward": -5.371183633804321, + "objective/scores": 0.0, + "policy/approxkl_avg": 24.61361312866211, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.693359375, + "step": 1400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.003964424133301 + }, + { + "episode": 22432, + "epoch": 0.13440222405962782, + "loss/policy_avg": 0.06431432068347931, + "lr": 9.10467791411043e-06, + "objective/entropy": -184.9761199951172, + "objective/kl": 34.157752990722656, + "objective/non_score_reward": -1.7078876495361328, + "objective/rlhf_reward": -6.831550419330597, + "objective/scores": 0.0, + "policy/approxkl_avg": 79.0606460571289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.732421875, + "step": 1401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9996626377105713 + }, + { + "episode": 22448, + "epoch": 0.13449808869875735, + "loss/policy_avg": 0.23548823595046997, + "lr": 9.104038854805727e-06, + "objective/entropy": -171.7328338623047, + "objective/kl": 30.062530517578125, + "objective/non_score_reward": -1.5031263828277588, + "objective/rlhf_reward": -4.496733867915806, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 11.886770248413086, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.71875, + "step": 1402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9983357191085815 + }, + { + "episode": 22464, + "epoch": 0.1345939533378869, + "loss/policy_avg": 0.10480177402496338, + "lr": 9.103399795501024e-06, + "objective/entropy": -196.30435180664062, + "objective/kl": 36.58341979980469, + "objective/non_score_reward": -1.8291711807250977, + "objective/rlhf_reward": -9.31668472290039, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.4910566806793213, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.638671875, + "step": 1403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0004119873046875 + }, + { + "episode": 22480, + "epoch": 0.13468981797701646, + "loss/policy_avg": 0.13037532567977905, + "lr": 9.10276073619632e-06, + "objective/entropy": -227.99850463867188, + "objective/kl": 39.84490203857422, + "objective/non_score_reward": -1.992245078086853, + "objective/rlhf_reward": -6.307120805204498, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 0.9137977361679077, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.62890625, + "step": 1404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.005516529083252 + }, + { + "episode": 22496, + "epoch": 0.134785682616146, + "loss/policy_avg": 0.9584113359451294, + "lr": 9.102121676891617e-06, + "objective/entropy": -243.3638153076172, + "objective/kl": 34.329063415527344, + "objective/non_score_reward": -1.7164533138275146, + "objective/rlhf_reward": -5.309553830829218, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 53.12579345703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.755859375, + "step": 1405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993245601654053 + }, + { + "episode": 22512, + "epoch": 0.13488154725527554, + "loss/policy_avg": 0.1008148044347763, + "lr": 9.101482617586912e-06, + "objective/entropy": -194.2659454345703, + "objective/kl": 26.940235137939453, + "objective/non_score_reward": -1.3470118045806885, + "objective/rlhf_reward": -7.388047218322754, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.30666732788086, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.857421875, + "step": 1406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0005905628204346 + }, + { + "episode": 22528, + "epoch": 0.1349774118944051, + "loss/policy_avg": 0.575897216796875, + "lr": 9.10084355828221e-06, + "objective/entropy": -225.23428344726562, + "objective/kl": 40.34715270996094, + "objective/non_score_reward": -2.01735782623291, + "objective/rlhf_reward": -10.06943130493164, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.475804328918457, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.82421875, + "step": 1407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.997112512588501 + }, + { + "episode": 22544, + "epoch": 0.13507327653353465, + "loss/policy_avg": 0.02817249298095703, + "lr": 9.100204498977506e-06, + "objective/entropy": -192.60028076171875, + "objective/kl": 34.591224670410156, + "objective/non_score_reward": -1.7295610904693604, + "objective/rlhf_reward": -8.918244361877441, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.901981353759766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.587890625, + "step": 1408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986708164215088 + }, + { + "episode": 22560, + "epoch": 0.1351691411726642, + "loss/policy_avg": 0.13303323090076447, + "lr": 9.099565439672803e-06, + "objective/entropy": -223.80252075195312, + "objective/kl": 31.527381896972656, + "objective/non_score_reward": -1.576369047164917, + "objective/rlhf_reward": -8.305476188659668, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.19985294342041, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.806640625, + "step": 1409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9999914169311523 + }, + { + "episode": 22576, + "epoch": 0.13526500581179374, + "loss/policy_avg": 0.043945979326963425, + "lr": 9.0989263803681e-06, + "objective/entropy": -134.9818572998047, + "objective/kl": 38.42216491699219, + "objective/non_score_reward": -1.9211082458496094, + "objective/rlhf_reward": -9.684432983398438, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.830410480499268, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.724609375, + "step": 1410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989550113677979 + }, + { + "episode": 22592, + "epoch": 0.1353608704509233, + "loss/policy_avg": 0.0696876272559166, + "lr": 9.098287321063395e-06, + "objective/entropy": -84.12158203125, + "objective/kl": 33.16938781738281, + "objective/non_score_reward": -1.658469319343567, + "objective/rlhf_reward": -6.633877277374268, + "objective/scores": 0.0, + "policy/approxkl_avg": 39.198543548583984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73046875, + "step": 1411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.000175714492798 + }, + { + "episode": 22608, + "epoch": 0.13545673509005285, + "loss/policy_avg": 0.5900977849960327, + "lr": 9.097648261758692e-06, + "objective/entropy": -169.15768432617188, + "objective/kl": 44.29988098144531, + "objective/non_score_reward": -2.214994430541992, + "objective/rlhf_reward": -10.859977722167969, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.046003341674805, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.919921875, + "step": 1412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9988155364990234 + }, + { + "episode": 22624, + "epoch": 0.1355525997291824, + "loss/policy_avg": 0.02379985898733139, + "lr": 9.097009202453987e-06, + "objective/entropy": -192.1562042236328, + "objective/kl": 33.012725830078125, + "objective/non_score_reward": -1.6506361961364746, + "objective/rlhf_reward": -6.6025450229644775, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.549930572509766, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.544921875, + "step": 1413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002074241638184 + }, + { + "episode": 22640, + "epoch": 0.13564846436831193, + "loss/policy_avg": 0.11564032733440399, + "lr": 9.096370143149284e-06, + "objective/entropy": -166.14932250976562, + "objective/kl": 35.863372802734375, + "objective/non_score_reward": -1.7931684255599976, + "objective/rlhf_reward": -2.772673940658569, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.785583257675171, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.783203125, + "step": 1414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.000230550765991 + }, + { + "episode": 22656, + "epoch": 0.1357443290074415, + "loss/policy_avg": 0.35586920380592346, + "lr": 9.095731083844581e-06, + "objective/entropy": -231.77088928222656, + "objective/kl": 49.181907653808594, + "objective/non_score_reward": -2.4590954780578613, + "objective/rlhf_reward": -11.836381912231445, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.9087586402893066, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.79296875, + "step": 1415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0006589889526367 + }, + { + "episode": 22672, + "epoch": 0.13584019364657104, + "loss/policy_avg": 0.3709731698036194, + "lr": 9.095092024539878e-06, + "objective/entropy": -250.41647338867188, + "objective/kl": 39.163543701171875, + "objective/non_score_reward": -1.958177089691162, + "objective/rlhf_reward": -7.832708716392517, + "objective/scores": 0.0, + "policy/approxkl_avg": 14.966727256774902, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.732421875, + "step": 1416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9999268054962158 + }, + { + "episode": 22688, + "epoch": 0.1359360582857006, + "loss/policy_avg": -0.11340951919555664, + "lr": 9.094452965235175e-06, + "objective/entropy": -220.16326904296875, + "objective/kl": 32.15831756591797, + "objective/non_score_reward": -1.6079161167144775, + "objective/rlhf_reward": -4.031664228439331, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.9896496534347534, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7578125, + "step": 1417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0187859535217285 + }, + { + "episode": 22704, + "epoch": 0.13603192292483013, + "loss/policy_avg": -0.019560977816581726, + "lr": 9.093813905930472e-06, + "objective/entropy": -147.41273498535156, + "objective/kl": 30.50977897644043, + "objective/non_score_reward": -1.5254889726638794, + "objective/rlhf_reward": -8.10195541381836, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.4244065284729, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.775390625, + "step": 1418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9999558925628662 + }, + { + "episode": 22720, + "epoch": 0.13612778756395968, + "loss/policy_avg": 0.11234011501073837, + "lr": 9.093174846625767e-06, + "objective/entropy": -271.77337646484375, + "objective/kl": 35.128662109375, + "objective/non_score_reward": -1.756433129310608, + "objective/rlhf_reward": -5.509960853847202, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 13.865474700927734, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.830078125, + "step": 1419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9976922273635864 + }, + { + "episode": 22736, + "epoch": 0.13622365220308924, + "loss/policy_avg": 0.099049873650074, + "lr": 9.092535787321064e-06, + "objective/entropy": -221.79019165039062, + "objective/kl": 30.804927825927734, + "objective/non_score_reward": -1.5402464866638184, + "objective/rlhf_reward": -6.160986244678497, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.9403247833251953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.646484375, + "step": 1420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988830089569092 + }, + { + "episode": 22752, + "epoch": 0.1363195168422188, + "loss/policy_avg": -0.08518800884485245, + "lr": 9.09189672801636e-06, + "objective/entropy": -182.5849609375, + "objective/kl": 32.34912109375, + "objective/non_score_reward": -1.6174559593200684, + "objective/rlhf_reward": -4.069823956489563, + "objective/scores": 0.6, + "policy/approxkl_avg": 54.8131217956543, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6875, + "step": 1421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993408918380737 + }, + { + "episode": 22768, + "epoch": 0.13641538148134832, + "loss/policy_avg": 0.4391246438026428, + "lr": 9.091257668711657e-06, + "objective/entropy": -181.03689575195312, + "objective/kl": 39.99858856201172, + "objective/non_score_reward": -1.9999295473098755, + "objective/rlhf_reward": -7.999718070030212, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.63614559173584, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.654296875, + "step": 1422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999501347541809 + }, + { + "episode": 22784, + "epoch": 0.13651124612047788, + "loss/policy_avg": 0.19108720123767853, + "lr": 9.090618609406954e-06, + "objective/entropy": -122.43475341796875, + "objective/kl": 32.97454833984375, + "objective/non_score_reward": -1.6487276554107666, + "objective/rlhf_reward": -8.594910621643066, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.676631927490234, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.681640625, + "step": 1423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005125999450684 + }, + { + "episode": 22800, + "epoch": 0.13660711075960744, + "loss/policy_avg": 0.2035399079322815, + "lr": 9.08997955010225e-06, + "objective/entropy": -163.61480712890625, + "objective/kl": 36.999427795410156, + "objective/non_score_reward": -1.8499714136123657, + "objective/rlhf_reward": -6.074372921019716, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 2.5806355476379395, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.55078125, + "step": 1424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9996778964996338 + }, + { + "episode": 22816, + "epoch": 0.136702975398737, + "loss/policy_avg": 0.5563037991523743, + "lr": 9.089340490797546e-06, + "objective/entropy": -202.767578125, + "objective/kl": 42.86833190917969, + "objective/non_score_reward": -2.1434166431427, + "objective/rlhf_reward": -8.57366669178009, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.489102840423584, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.771484375, + "step": 1425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0018134117126465 + }, + { + "episode": 22832, + "epoch": 0.13679884003786652, + "loss/policy_avg": -0.1872452199459076, + "lr": 9.088701431492843e-06, + "objective/entropy": -144.49172973632812, + "objective/kl": 38.54411697387695, + "objective/non_score_reward": -1.9272058010101318, + "objective/rlhf_reward": -9.708823204040527, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.001338958740234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6328125, + "step": 1426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0178823471069336 + }, + { + "episode": 22848, + "epoch": 0.13689470467699608, + "loss/policy_avg": 0.0774359256029129, + "lr": 9.08806237218814e-06, + "objective/entropy": -233.4342498779297, + "objective/kl": 44.11488723754883, + "objective/non_score_reward": -2.2057442665100098, + "objective/rlhf_reward": -8.822977304458618, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.1445496082305908, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 1427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.000418186187744 + }, + { + "episode": 22864, + "epoch": 0.13699056931612563, + "loss/policy_avg": 0.39956003427505493, + "lr": 9.087423312883437e-06, + "objective/entropy": -84.71013641357422, + "objective/kl": 46.03721618652344, + "objective/non_score_reward": -2.301860809326172, + "objective/rlhf_reward": -7.865806987791686, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 3.3770084381103516, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.53125, + "step": 1428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9996185302734375 + }, + { + "episode": 22880, + "epoch": 0.1370864339552552, + "loss/policy_avg": 0.012207206338644028, + "lr": 9.086784253578734e-06, + "objective/entropy": -80.51762390136719, + "objective/kl": 33.242164611816406, + "objective/non_score_reward": -1.6621081829071045, + "objective/rlhf_reward": -5.132661068233189, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 28.132904052734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.640625, + "step": 1429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0003459453582764 + }, + { + "episode": 22896, + "epoch": 0.13718229859438472, + "loss/policy_avg": 0.03659982234239578, + "lr": 9.086145194274029e-06, + "objective/entropy": -263.95465087890625, + "objective/kl": 34.276649475097656, + "objective/non_score_reward": -1.7138323783874512, + "objective/rlhf_reward": -8.855329513549805, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.828452110290527, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6328125, + "step": 1430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993278980255127 + }, + { + "episode": 22912, + "epoch": 0.13727816323351427, + "loss/policy_avg": 0.20895805954933167, + "lr": 9.085506134969326e-06, + "objective/entropy": -164.40029907226562, + "objective/kl": 28.611112594604492, + "objective/non_score_reward": -1.4305555820465088, + "objective/rlhf_reward": -4.165963082519129, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.4341490268707275, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.689453125, + "step": 1431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999100685119629 + }, + { + "episode": 22928, + "epoch": 0.13737402787264383, + "loss/policy_avg": 0.24767997860908508, + "lr": 9.084867075664623e-06, + "objective/entropy": -176.92132568359375, + "objective/kl": 33.8932991027832, + "objective/non_score_reward": -1.6946650743484497, + "objective/rlhf_reward": -8.77865982055664, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.446657180786133, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 1432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9946794509887695 + }, + { + "episode": 22944, + "epoch": 0.13746989251177338, + "loss/policy_avg": 0.2693820595741272, + "lr": 9.08422801635992e-06, + "objective/entropy": -210.73587036132812, + "objective/kl": 33.37498474121094, + "objective/non_score_reward": -1.668749213218689, + "objective/rlhf_reward": -4.552290620581184, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 0.9048199653625488, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.623046875, + "step": 1433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.002692222595215 + }, + { + "episode": 22960, + "epoch": 0.1375657571509029, + "loss/policy_avg": 0.1713779866695404, + "lr": 9.083588957055215e-06, + "objective/entropy": -220.00888061523438, + "objective/kl": 32.15864562988281, + "objective/non_score_reward": -1.6079323291778564, + "objective/rlhf_reward": -6.431729555130005, + "objective/scores": 0.0, + "policy/approxkl_avg": 14.331047058105469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 1434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9982788562774658 + }, + { + "episode": 22976, + "epoch": 0.13766162179003247, + "loss/policy_avg": 0.25374528765678406, + "lr": 9.082949897750512e-06, + "objective/entropy": -1.4378490447998047, + "objective/kl": 40.159549713134766, + "objective/non_score_reward": -2.0079774856567383, + "objective/rlhf_reward": -6.72122918625772, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 9.500734329223633, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.609375, + "step": 1435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998857021331787 + }, + { + "episode": 22992, + "epoch": 0.13775748642916202, + "loss/policy_avg": 0.49725142121315, + "lr": 9.082310838445809e-06, + "objective/entropy": -118.1282730102539, + "objective/kl": 30.619491577148438, + "objective/non_score_reward": -1.5309746265411377, + "objective/rlhf_reward": -8.12389850616455, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.352041244506836, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.828125, + "step": 1436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988855123519897 + }, + { + "episode": 23008, + "epoch": 0.13785335106829158, + "loss/policy_avg": 0.15841074287891388, + "lr": 9.081671779141104e-06, + "objective/entropy": -211.70982360839844, + "objective/kl": 40.90330505371094, + "objective/non_score_reward": -2.045165538787842, + "objective/rlhf_reward": -10.180662155151367, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.64789962768555, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.650390625, + "step": 1437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982593059539795 + }, + { + "episode": 23024, + "epoch": 0.1379492157074211, + "loss/policy_avg": 0.32557016611099243, + "lr": 9.0810327198364e-06, + "objective/entropy": -201.66539001464844, + "objective/kl": 31.07311248779297, + "objective/non_score_reward": -1.5536556243896484, + "objective/rlhf_reward": -4.610502395693379, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 19.387422561645508, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.814453125, + "step": 1438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9976288080215454 + }, + { + "episode": 23040, + "epoch": 0.13804508034655066, + "loss/policy_avg": -0.03757210075855255, + "lr": 9.080393660531698e-06, + "objective/entropy": -158.59872436523438, + "objective/kl": 40.712257385253906, + "objective/non_score_reward": -2.0356125831604004, + "objective/rlhf_reward": -3.742451167106628, + "objective/scores": 1.1, + "policy/approxkl_avg": 30.52703094482422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5546875, + "step": 1439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0008606910705566 + }, + { + "episode": 23056, + "epoch": 0.13814094498568022, + "loss/policy_avg": -0.03759051859378815, + "lr": 9.079754601226994e-06, + "objective/entropy": -229.6117401123047, + "objective/kl": 31.384471893310547, + "objective/non_score_reward": -1.5692236423492432, + "objective/rlhf_reward": -6.2768948674201965, + "objective/scores": 0.0, + "policy/approxkl_avg": 22.246253967285156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.701171875, + "step": 1440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001797676086426 + }, + { + "episode": 23072, + "epoch": 0.13823680962480978, + "loss/policy_avg": -0.011252786964178085, + "lr": 9.079115541922291e-06, + "objective/entropy": -188.2359619140625, + "objective/kl": 30.88317108154297, + "objective/non_score_reward": -1.544158697128296, + "objective/rlhf_reward": -6.176634788513184, + "objective/scores": 0.0, + "policy/approxkl_avg": 56.69608688354492, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60546875, + "step": 1441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0000128746032715 + }, + { + "episode": 23088, + "epoch": 0.13833267426393933, + "loss/policy_avg": 0.1684042513370514, + "lr": 9.078476482617588e-06, + "objective/entropy": -212.06155395507812, + "objective/kl": 33.483802795410156, + "objective/non_score_reward": -1.6741902828216553, + "objective/rlhf_reward": -6.696760892868042, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0115039348602295, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62890625, + "step": 1442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000898838043213 + }, + { + "episode": 23104, + "epoch": 0.13842853890306886, + "loss/policy_avg": 0.015328019857406616, + "lr": 9.077837423312883e-06, + "objective/entropy": -68.81130981445312, + "objective/kl": 49.40843963623047, + "objective/non_score_reward": -2.4704220294952393, + "objective/rlhf_reward": -5.481688117980957, + "objective/scores": 1.1, + "policy/approxkl_avg": 45.8652458190918, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6875, + "step": 1443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000455617904663 + }, + { + "episode": 23120, + "epoch": 0.13852440354219842, + "loss/policy_avg": -0.01319362223148346, + "lr": 9.07719836400818e-06, + "objective/entropy": -205.28091430664062, + "objective/kl": 33.12495040893555, + "objective/non_score_reward": -1.6562474966049194, + "objective/rlhf_reward": -8.624990463256836, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.4587922096252441, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.55859375, + "step": 1444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0040221214294434 + }, + { + "episode": 23136, + "epoch": 0.13862026818132797, + "loss/policy_avg": -0.13566766679286957, + "lr": 9.076559304703477e-06, + "objective/entropy": -168.12368774414062, + "objective/kl": 35.132469177246094, + "objective/non_score_reward": -1.7566235065460205, + "objective/rlhf_reward": -9.026494026184082, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.900660753250122, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.79296875, + "step": 1445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0029544830322266 + }, + { + "episode": 23152, + "epoch": 0.13871613282045753, + "loss/policy_avg": -0.10704624652862549, + "lr": 9.075920245398774e-06, + "objective/entropy": -150.61312866210938, + "objective/kl": 42.026371002197266, + "objective/non_score_reward": -2.101318359375, + "objective/rlhf_reward": -7.046023809646053, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 10.10629940032959, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.830078125, + "step": 1446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9992228746414185 + }, + { + "episode": 23168, + "epoch": 0.13881199745958706, + "loss/policy_avg": 0.5887693166732788, + "lr": 9.075281186094071e-06, + "objective/entropy": -159.28955078125, + "objective/kl": 39.38755798339844, + "objective/non_score_reward": -1.969377875328064, + "objective/rlhf_reward": -7.877511501312256, + "objective/scores": 0.0, + "policy/approxkl_avg": 35.62648010253906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.591796875, + "step": 1447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980779886245728 + }, + { + "episode": 23184, + "epoch": 0.1389078620987166, + "loss/policy_avg": 0.1427895575761795, + "lr": 9.074642126789366e-06, + "objective/entropy": -246.7955322265625, + "objective/kl": 43.705169677734375, + "objective/non_score_reward": -2.185258388519287, + "objective/rlhf_reward": -4.3410329580307, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.562838077545166, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.67578125, + "step": 1448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9986655712127686 + }, + { + "episode": 23200, + "epoch": 0.13900372673784617, + "loss/policy_avg": 0.23600831627845764, + "lr": 9.074003067484663e-06, + "objective/entropy": 46.658721923828125, + "objective/kl": 42.16393280029297, + "objective/non_score_reward": -2.10819673538208, + "objective/rlhf_reward": -7.122105708321929, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 1.271384596824646, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.634765625, + "step": 1449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001964569091797 + }, + { + "episode": 23216, + "epoch": 0.13909959137697572, + "loss/policy_avg": 0.32459360361099243, + "lr": 9.07336400817996e-06, + "objective/entropy": -159.74755859375, + "objective/kl": 37.46391677856445, + "objective/non_score_reward": -1.873195767402649, + "objective/rlhf_reward": -5.370076837317024, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 3.1252617835998535, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.720703125, + "step": 1450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0047178268432617 + }, + { + "episode": 23232, + "epoch": 0.13919545601610525, + "loss/policy_avg": 0.10729147493839264, + "lr": 9.072724948875257e-06, + "objective/entropy": -232.62164306640625, + "objective/kl": 30.652565002441406, + "objective/non_score_reward": -1.5326282978057861, + "objective/rlhf_reward": -8.130512237548828, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.1043455600738525, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.59765625, + "step": 1451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989426136016846 + }, + { + "episode": 23248, + "epoch": 0.1392913206552348, + "loss/policy_avg": -0.03566136211156845, + "lr": 9.072085889570554e-06, + "objective/entropy": -175.93356323242188, + "objective/kl": 31.67029571533203, + "objective/non_score_reward": -1.583514928817749, + "objective/rlhf_reward": -5.023378839692474, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 5.333813667297363, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.865234375, + "step": 1452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000720977783203 + }, + { + "episode": 23264, + "epoch": 0.13938718529436436, + "loss/policy_avg": 0.5625999569892883, + "lr": 9.07144683026585e-06, + "objective/entropy": -88.34669494628906, + "objective/kl": 47.17588806152344, + "objective/non_score_reward": -2.3587944507598877, + "objective/rlhf_reward": -9.43517780303955, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.584896087646484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.681640625, + "step": 1453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998932957649231 + }, + { + "episode": 23280, + "epoch": 0.13948304993349392, + "loss/policy_avg": -0.2735084891319275, + "lr": 9.070807770961146e-06, + "objective/entropy": -164.4346160888672, + "objective/kl": 35.650638580322266, + "objective/non_score_reward": -1.782531976699829, + "objective/rlhf_reward": -7.130127787590027, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.297776222229004, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.69921875, + "step": 1454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0009090900421143 + }, + { + "episode": 23296, + "epoch": 0.13957891457262345, + "loss/policy_avg": 0.18696127831935883, + "lr": 9.070168711656443e-06, + "objective/entropy": -161.61410522460938, + "objective/kl": 29.608388900756836, + "objective/non_score_reward": -1.4804195165634155, + "objective/rlhf_reward": -4.596165094405336, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 0.974159300327301, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.74609375, + "step": 1455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0017507076263428 + }, + { + "episode": 23312, + "epoch": 0.139674779211753, + "loss/policy_avg": 0.2966254949569702, + "lr": 9.069529652351738e-06, + "objective/entropy": -226.92344665527344, + "objective/kl": 30.372020721435547, + "objective/non_score_reward": -1.5186010599136353, + "objective/rlhf_reward": -6.0744041204452515, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.384140968322754, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7421875, + "step": 1456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0020666122436523 + }, + { + "episode": 23328, + "epoch": 0.13977064385088256, + "loss/policy_avg": -0.01494111493229866, + "lr": 9.068890593047035e-06, + "objective/entropy": -216.09146118164062, + "objective/kl": 36.78660202026367, + "objective/non_score_reward": -1.8393300771713257, + "objective/rlhf_reward": -7.357320189476013, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2278923988342285, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.662109375, + "step": 1457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0012807846069336 + }, + { + "episode": 23344, + "epoch": 0.13986650849001211, + "loss/policy_avg": 0.07382915169000626, + "lr": 9.068251533742332e-06, + "objective/entropy": -182.36463928222656, + "objective/kl": 37.20063018798828, + "objective/non_score_reward": -1.8600313663482666, + "objective/rlhf_reward": -7.440125584602356, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.324472904205322, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.72265625, + "step": 1458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998921275138855 + }, + { + "episode": 23360, + "epoch": 0.13996237312914164, + "loss/policy_avg": 0.4102204442024231, + "lr": 9.067612474437628e-06, + "objective/entropy": -246.98814392089844, + "objective/kl": 40.265289306640625, + "objective/non_score_reward": -2.0132644176483154, + "objective/rlhf_reward": -8.05305790901184, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.245479583740234, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.787109375, + "step": 1459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9975072145462036 + }, + { + "episode": 23376, + "epoch": 0.1400582377682712, + "loss/policy_avg": 0.19098971784114838, + "lr": 9.066973415132925e-06, + "objective/entropy": -188.01968383789062, + "objective/kl": 31.92349624633789, + "objective/non_score_reward": -1.596174955368042, + "objective/rlhf_reward": -6.384699463844299, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.8341217041015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.67578125, + "step": 1460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9966533184051514 + }, + { + "episode": 23392, + "epoch": 0.14015410240740075, + "loss/policy_avg": 0.8235700130462646, + "lr": 9.06633435582822e-06, + "objective/entropy": -183.0225830078125, + "objective/kl": 37.038665771484375, + "objective/non_score_reward": -1.8519333600997925, + "objective/rlhf_reward": -6.082220230132265, + "objective/scores": 0.3313782131597591, + "policy/approxkl_avg": 66.31561279296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6953125, + "step": 1461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9981609582901 + }, + { + "episode": 23408, + "epoch": 0.1402499670465303, + "loss/policy_avg": 0.8242926001548767, + "lr": 9.065695296523517e-06, + "objective/entropy": -223.21212768554688, + "objective/kl": 27.327381134033203, + "objective/non_score_reward": -1.3663692474365234, + "objective/rlhf_reward": -3.5180655819939926, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.023781776428223, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.677734375, + "step": 1462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0004143714904785 + }, + { + "episode": 23424, + "epoch": 0.14034583168565984, + "loss/policy_avg": -0.012389983981847763, + "lr": 9.065056237218814e-06, + "objective/entropy": -92.06814575195312, + "objective/kl": 40.54405212402344, + "objective/non_score_reward": -2.027202606201172, + "objective/rlhf_reward": -6.798129310807585, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 1.5758436918258667, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.708984375, + "step": 1463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0014731884002686 + }, + { + "episode": 23440, + "epoch": 0.1404416963247894, + "loss/policy_avg": 0.004808776080608368, + "lr": 9.064417177914111e-06, + "objective/entropy": -163.3157501220703, + "objective/kl": 34.39897918701172, + "objective/non_score_reward": -1.7199490070343018, + "objective/rlhf_reward": -4.757090034262214, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 12.671178817749023, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6640625, + "step": 1464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0005481243133545 + }, + { + "episode": 23456, + "epoch": 0.14053756096391895, + "loss/policy_avg": 0.947679877281189, + "lr": 9.063778118609408e-06, + "objective/entropy": -275.1947021484375, + "objective/kl": 51.65427017211914, + "objective/non_score_reward": -2.5827133655548096, + "objective/rlhf_reward": -8.880255322070465, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 4.93233585357666, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.70703125, + "step": 1465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000178098678589 + }, + { + "episode": 23472, + "epoch": 0.1406334256030485, + "loss/policy_avg": 0.11190029978752136, + "lr": 9.063139059304705e-06, + "objective/entropy": -153.43563842773438, + "objective/kl": 36.64055633544922, + "objective/non_score_reward": -1.832027792930603, + "objective/rlhf_reward": -7.328111171722412, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8334834575653076, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.751953125, + "step": 1466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.0003063678741455 + }, + { + "episode": 23488, + "epoch": 0.14072929024217803, + "loss/policy_avg": 1.0783703327178955, + "lr": 9.0625e-06, + "objective/entropy": -234.9717254638672, + "objective/kl": 36.277320861816406, + "objective/non_score_reward": -1.813866138458252, + "objective/rlhf_reward": -7.255464673042297, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.389228105545044, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.720703125, + "step": 1467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0003976821899414 + }, + { + "episode": 23504, + "epoch": 0.1408251548813076, + "loss/policy_avg": 0.4315020442008972, + "lr": 9.061860940695297e-06, + "objective/entropy": -241.1446533203125, + "objective/kl": 38.00855255126953, + "objective/non_score_reward": -1.9004275798797607, + "objective/rlhf_reward": -9.601710319519043, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.890410900115967, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.630859375, + "step": 1468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000802993774414 + }, + { + "episode": 23520, + "epoch": 0.14092101952043715, + "loss/policy_avg": 0.011334262788295746, + "lr": 9.061221881390594e-06, + "objective/entropy": -105.41771697998047, + "objective/kl": 28.207796096801758, + "objective/non_score_reward": -1.4103899002075195, + "objective/rlhf_reward": -1.2415596008300778, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.8694658279418945, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.78125, + "step": 1469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999434232711792 + }, + { + "episode": 23536, + "epoch": 0.1410168841595667, + "loss/policy_avg": 0.12156292051076889, + "lr": 9.06058282208589e-06, + "objective/entropy": -177.39425659179688, + "objective/kl": 26.367612838745117, + "objective/non_score_reward": -1.31838059425354, + "objective/rlhf_reward": -3.9142727491602134, + "objective/scores": 0.33981246656813147, + "policy/approxkl_avg": 1.3104565143585205, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 1470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0008339881896973 + }, + { + "episode": 23552, + "epoch": 0.14111274879869623, + "loss/policy_avg": 0.03381851315498352, + "lr": 9.059943762781188e-06, + "objective/entropy": -243.26577758789062, + "objective/kl": 47.82252502441406, + "objective/non_score_reward": -2.3911263942718506, + "objective/rlhf_reward": -8.22286968520227, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 1.0646920204162598, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.572265625, + "step": 1471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001126766204834 + }, + { + "episode": 23568, + "epoch": 0.1412086134378258, + "loss/policy_avg": 0.26619529724121094, + "lr": 9.059304703476484e-06, + "objective/entropy": -235.33363342285156, + "objective/kl": 37.040279388427734, + "objective/non_score_reward": -1.852014183998108, + "objective/rlhf_reward": -7.4080564975738525, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6541755199432373, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7265625, + "step": 1472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9983246326446533 + }, + { + "episode": 23584, + "epoch": 0.14130447807695534, + "loss/policy_avg": 0.7362419366836548, + "lr": 9.05866564417178e-06, + "objective/entropy": -259.7974853515625, + "objective/kl": 38.83033752441406, + "objective/non_score_reward": -1.9415171146392822, + "objective/rlhf_reward": -7.766068339347839, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.9125937819480896, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 1473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001113176345825 + }, + { + "episode": 23600, + "epoch": 0.1414003427160849, + "loss/policy_avg": 0.15635953843593597, + "lr": 9.058026584867077e-06, + "objective/entropy": -134.94927978515625, + "objective/kl": 47.25566482543945, + "objective/non_score_reward": -2.362783432006836, + "objective/rlhf_reward": -9.451133787631989, + "objective/scores": 0.0, + "policy/approxkl_avg": 33.38326644897461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.740234375, + "step": 1474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.997588872909546 + }, + { + "episode": 23616, + "epoch": 0.14149620735521443, + "loss/policy_avg": 0.3056976795196533, + "lr": 9.057387525562373e-06, + "objective/entropy": -215.6920928955078, + "objective/kl": 34.730552673339844, + "objective/non_score_reward": -1.736527442932129, + "objective/rlhf_reward": -8.946109771728516, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.363327026367188, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7578125, + "step": 1475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999530553817749 + }, + { + "episode": 23632, + "epoch": 0.14159207199434398, + "loss/policy_avg": 0.3586670756340027, + "lr": 9.05674846625767e-06, + "objective/entropy": -162.2206573486328, + "objective/kl": 45.82000732421875, + "objective/non_score_reward": -2.2910006046295166, + "objective/rlhf_reward": -6.764002180099487, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.3060927391052246, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.591796875, + "step": 1476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9998315572738647 + }, + { + "episode": 23648, + "epoch": 0.14168793663347354, + "loss/policy_avg": 0.08927397429943085, + "lr": 9.056109406952967e-06, + "objective/entropy": -211.142578125, + "objective/kl": 40.72630310058594, + "objective/non_score_reward": -2.0363149642944336, + "objective/rlhf_reward": -10.145259857177734, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.7020206451416, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.669921875, + "step": 1477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981908798217773 + }, + { + "episode": 23664, + "epoch": 0.1417838012726031, + "loss/policy_avg": 0.03727121651172638, + "lr": 9.055470347648262e-06, + "objective/entropy": -276.27386474609375, + "objective/kl": 32.27709197998047, + "objective/non_score_reward": -1.6138546466827393, + "objective/rlhf_reward": -8.45541763305664, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.986801147460938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.673828125, + "step": 1478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0006332397460938 + }, + { + "episode": 23680, + "epoch": 0.14187966591173262, + "loss/policy_avg": -0.15662336349487305, + "lr": 9.05483128834356e-06, + "objective/entropy": -119.97581481933594, + "objective/kl": 36.79579162597656, + "objective/non_score_reward": -1.839789628982544, + "objective/rlhf_reward": -7.359158635139465, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8840548992156982, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68359375, + "step": 1479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0026803016662598 + }, + { + "episode": 23696, + "epoch": 0.14197553055086218, + "loss/policy_avg": 0.620514988899231, + "lr": 9.054192229038854e-06, + "objective/entropy": -268.1829833984375, + "objective/kl": 42.271705627441406, + "objective/non_score_reward": -2.1135849952697754, + "objective/rlhf_reward": -6.5069293481873824, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.6114516258239746, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6484375, + "step": 1480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9983903169631958 + }, + { + "episode": 23712, + "epoch": 0.14207139518999173, + "loss/policy_avg": 0.15371036529541016, + "lr": 9.053553169734151e-06, + "objective/entropy": -244.808837890625, + "objective/kl": 30.902687072753906, + "objective/non_score_reward": -1.5451343059539795, + "objective/rlhf_reward": -4.355708475383829, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 1.3943830728530884, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.669921875, + "step": 1481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9997410774230957 + }, + { + "episode": 23728, + "epoch": 0.1421672598291213, + "loss/policy_avg": 0.0902445912361145, + "lr": 9.052914110429448e-06, + "objective/entropy": -223.44686889648438, + "objective/kl": 29.38083267211914, + "objective/non_score_reward": -1.4690418243408203, + "objective/rlhf_reward": -5.876166999340057, + "objective/scores": 0.0, + "policy/approxkl_avg": 23.800241470336914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.634765625, + "step": 1482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9976519346237183 + }, + { + "episode": 23744, + "epoch": 0.14226312446825082, + "loss/policy_avg": 0.27748212218284607, + "lr": 9.052275051124745e-06, + "objective/entropy": -198.6671142578125, + "objective/kl": 40.270076751708984, + "objective/non_score_reward": -2.0135040283203125, + "objective/rlhf_reward": -6.49775656959112, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 5.921305179595947, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.75390625, + "step": 1483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987270832061768 + }, + { + "episode": 23760, + "epoch": 0.14235898910738037, + "loss/policy_avg": -0.06764909625053406, + "lr": 9.051635991820042e-06, + "objective/entropy": -243.7007293701172, + "objective/kl": 35.429527282714844, + "objective/non_score_reward": -1.7714765071868896, + "objective/rlhf_reward": -9.085906028747559, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.847067832946777, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.724609375, + "step": 1484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000624179840088 + }, + { + "episode": 23776, + "epoch": 0.14245485374650993, + "loss/policy_avg": 0.35366156697273254, + "lr": 9.050996932515339e-06, + "objective/entropy": -190.80877685546875, + "objective/kl": 48.02167510986328, + "objective/non_score_reward": -2.4010837078094482, + "objective/rlhf_reward": -9.604334831237793, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.951902389526367, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.669921875, + "step": 1485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994319677352905 + }, + { + "episode": 23792, + "epoch": 0.14255071838563949, + "loss/policy_avg": 0.6607328653335571, + "lr": 9.050357873210634e-06, + "objective/entropy": -231.9893798828125, + "objective/kl": 35.79993438720703, + "objective/non_score_reward": -1.789996862411499, + "objective/rlhf_reward": -7.159987449645996, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.2611920833587646, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.662109375, + "step": 1486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00154709815979 + }, + { + "episode": 23808, + "epoch": 0.142646583024769, + "loss/policy_avg": 0.13746711611747742, + "lr": 9.049718813905931e-06, + "objective/entropy": -221.01318359375, + "objective/kl": 34.20423126220703, + "objective/non_score_reward": -1.7102115154266357, + "objective/rlhf_reward": -6.840846002101898, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.62340784072876, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.75390625, + "step": 1487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001349449157715 + }, + { + "episode": 23824, + "epoch": 0.14274244766389857, + "loss/policy_avg": 0.03470245376229286, + "lr": 9.049079754601228e-06, + "objective/entropy": -271.19476318359375, + "objective/kl": 43.550689697265625, + "objective/non_score_reward": -2.177534580230713, + "objective/rlhf_reward": -8.710138082504272, + "objective/scores": 0.0, + "policy/approxkl_avg": 58.72370147705078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.794921875, + "step": 1488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9994006156921387 + }, + { + "episode": 23840, + "epoch": 0.14283831230302813, + "loss/policy_avg": 0.4665597081184387, + "lr": 9.048440695296525e-06, + "objective/entropy": -223.2164306640625, + "objective/kl": 25.610610961914062, + "objective/non_score_reward": -1.2805306911468506, + "objective/rlhf_reward": -5.1221224665641785, + "objective/scores": 0.0, + "policy/approxkl_avg": 35.59868240356445, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.599609375, + "step": 1489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999636173248291 + }, + { + "episode": 23856, + "epoch": 0.14293417694215768, + "loss/policy_avg": 0.1881203055381775, + "lr": 9.047801635991821e-06, + "objective/entropy": -231.2401580810547, + "objective/kl": 25.03434944152832, + "objective/non_score_reward": -1.2517175674438477, + "objective/rlhf_reward": -3.5562718912080378, + "objective/scores": 0.36264953503719355, + "policy/approxkl_avg": 52.73191833496094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 1490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.997032880783081 + }, + { + "episode": 23872, + "epoch": 0.1430300415812872, + "loss/policy_avg": 0.20446205139160156, + "lr": 9.047162576687117e-06, + "objective/entropy": -221.01141357421875, + "objective/kl": 34.63885498046875, + "objective/non_score_reward": -1.7319427728652954, + "objective/rlhf_reward": -8.927770614624023, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.67100524902344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.869140625, + "step": 1491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.00142765045166 + }, + { + "episode": 23888, + "epoch": 0.14312590622041677, + "loss/policy_avg": 0.027562838047742844, + "lr": 9.046523517382414e-06, + "objective/entropy": -161.34091186523438, + "objective/kl": 32.6038932800293, + "objective/non_score_reward": -1.6301947832107544, + "objective/rlhf_reward": -6.5207788944244385, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.0126057863235474, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.712890625, + "step": 1492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999809265136719 + }, + { + "episode": 23904, + "epoch": 0.14322177085954632, + "loss/policy_avg": 0.24607491493225098, + "lr": 9.04588445807771e-06, + "objective/entropy": -263.24951171875, + "objective/kl": 33.5936393737793, + "objective/non_score_reward": -1.6796820163726807, + "objective/rlhf_reward": -6.718727946281433, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.278523921966553, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.630859375, + "step": 1493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993181228637695 + }, + { + "episode": 23920, + "epoch": 0.14331763549867588, + "loss/policy_avg": 0.3695078492164612, + "lr": 9.045245398773007e-06, + "objective/entropy": -208.70309448242188, + "objective/kl": 27.421342849731445, + "objective/non_score_reward": -1.3710671663284302, + "objective/rlhf_reward": -7.484268665313721, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.885932922363281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.701171875, + "step": 1494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999609351158142 + }, + { + "episode": 23936, + "epoch": 0.1434135001378054, + "loss/policy_avg": 0.14508160948753357, + "lr": 9.044606339468304e-06, + "objective/entropy": -216.826171875, + "objective/kl": 44.74562072753906, + "objective/non_score_reward": -2.23728084564209, + "objective/rlhf_reward": -8.949124217033386, + "objective/scores": 0.0, + "policy/approxkl_avg": 21.513931274414062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.84765625, + "step": 1495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 2.000661611557007 + }, + { + "episode": 23952, + "epoch": 0.14350936477693496, + "loss/policy_avg": 0.05339039862155914, + "lr": 9.043967280163601e-06, + "objective/entropy": -235.76129150390625, + "objective/kl": 28.482507705688477, + "objective/non_score_reward": -1.4241254329681396, + "objective/rlhf_reward": -3.871672685417246, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 13.350640296936035, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.87890625, + "step": 1496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998685598373413 + }, + { + "episode": 23968, + "epoch": 0.14360522941606452, + "loss/policy_avg": 0.04982265084981918, + "lr": 9.043328220858896e-06, + "objective/entropy": -250.32717895507812, + "objective/kl": 38.19590759277344, + "objective/non_score_reward": -1.9097955226898193, + "objective/rlhf_reward": -7.639182090759277, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.951807022094727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.734375, + "step": 1497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9975727796554565 + }, + { + "episode": 23984, + "epoch": 0.14370109405519407, + "loss/policy_avg": 0.6328533887863159, + "lr": 9.042689161554193e-06, + "objective/entropy": -148.8836669921875, + "objective/kl": 23.223995208740234, + "objective/non_score_reward": -1.161199688911438, + "objective/rlhf_reward": -4.644798755645752, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.808246612548828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.671875, + "step": 1498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999420404434204 + }, + { + "episode": 24000, + "epoch": 0.14379695869432363, + "loss/policy_avg": 0.12657588720321655, + "lr": 9.04205010224949e-06, + "objective/entropy": -191.64862060546875, + "objective/kl": 31.65740966796875, + "objective/non_score_reward": -1.5828704833984375, + "objective/rlhf_reward": -6.33148193359375, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.026250839233398, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.728515625, + "step": 1499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0003626346588135 + }, + { + "episode": 24016, + "epoch": 0.14389282333345316, + "loss/policy_avg": 0.4729980528354645, + "lr": 9.041411042944787e-06, + "objective/entropy": -227.8267822265625, + "objective/kl": 33.30803680419922, + "objective/non_score_reward": -1.6654019355773926, + "objective/rlhf_reward": -8.66160774230957, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.261730670928955, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.810546875, + "step": 1500, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0002188682556152 + }, + { + "episode": 24032, + "epoch": 0.1439886879725827, + "loss/policy_avg": 0.12815649807453156, + "lr": 9.040771983640082e-06, + "objective/entropy": -231.8274383544922, + "objective/kl": 42.42681884765625, + "objective/non_score_reward": -2.121340751647949, + "objective/rlhf_reward": -10.485363006591797, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.1228022575378418, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 1501, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0006580352783203 + }, + { + "episode": 24048, + "epoch": 0.14408455261171227, + "loss/policy_avg": -0.05794616788625717, + "lr": 9.040132924335379e-06, + "objective/entropy": -208.10671997070312, + "objective/kl": 41.5554313659668, + "objective/non_score_reward": -2.0777716636657715, + "objective/rlhf_reward": -8.311086416244507, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.234374523162842, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.716796875, + "step": 1502, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9994726181030273 + }, + { + "episode": 24064, + "epoch": 0.14418041725084182, + "loss/policy_avg": 0.16567739844322205, + "lr": 9.039493865030676e-06, + "objective/entropy": -234.3512725830078, + "objective/kl": 37.30360412597656, + "objective/non_score_reward": -1.8651800155639648, + "objective/rlhf_reward": -7.460720062255859, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.919219970703125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.703125, + "step": 1503, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.0018868446350098 + }, + { + "episode": 24080, + "epoch": 0.14427628188997135, + "loss/policy_avg": 1.0528833866119385, + "lr": 9.038854805725971e-06, + "objective/entropy": -152.41259765625, + "objective/kl": 35.38163757324219, + "objective/non_score_reward": -1.7690820693969727, + "objective/rlhf_reward": -2.6763280689716336, + "objective/scores": 1.1, + "policy/approxkl_avg": 0.9121379852294922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80078125, + "step": 1504, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.00167179107666 + }, + { + "episode": 24096, + "epoch": 0.1443721465291009, + "loss/policy_avg": 0.6609749794006348, + "lr": 9.038215746421268e-06, + "objective/entropy": -273.45556640625, + "objective/kl": 33.85598373413086, + "objective/non_score_reward": -1.6927990913391113, + "objective/rlhf_reward": -4.8237853748368575, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 106.44645690917969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7578125, + "step": 1505, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9976800680160522 + }, + { + "episode": 24112, + "epoch": 0.14446801116823046, + "loss/policy_avg": 0.15931403636932373, + "lr": 9.037576687116565e-06, + "objective/entropy": -287.9610900878906, + "objective/kl": 33.58984375, + "objective/non_score_reward": -1.6794922351837158, + "objective/rlhf_reward": -6.717969179153442, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.7928564548492432, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7109375, + "step": 1506, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999690294265747 + }, + { + "episode": 24128, + "epoch": 0.14456387580736002, + "loss/policy_avg": 0.1707848161458969, + "lr": 9.036937627811862e-06, + "objective/entropy": -164.31118774414062, + "objective/kl": 35.66673278808594, + "objective/non_score_reward": -1.783336877822876, + "objective/rlhf_reward": -7.133347153663635, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3680481910705566, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6953125, + "step": 1507, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976303577423096 + }, + { + "episode": 24144, + "epoch": 0.14465974044648955, + "loss/policy_avg": 0.19946977496147156, + "lr": 9.036298568507159e-06, + "objective/entropy": -291.9969177246094, + "objective/kl": 36.32465362548828, + "objective/non_score_reward": -1.8162329196929932, + "objective/rlhf_reward": -7.264931559562683, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.556414604187012, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.65234375, + "step": 1508, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989137649536133 + }, + { + "episode": 24160, + "epoch": 0.1447556050856191, + "loss/policy_avg": 0.07756930589675903, + "lr": 9.035659509202455e-06, + "objective/entropy": -270.1118469238281, + "objective/kl": 38.389747619628906, + "objective/non_score_reward": -1.919487476348877, + "objective/rlhf_reward": -6.0160907558804615, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 0.3965752124786377, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.654296875, + "step": 1509, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0007505416870117 + }, + { + "episode": 24176, + "epoch": 0.14485146972474866, + "loss/policy_avg": -0.08306282758712769, + "lr": 9.03502044989775e-06, + "objective/entropy": -189.57479858398438, + "objective/kl": 35.964874267578125, + "objective/non_score_reward": -1.798243761062622, + "objective/rlhf_reward": -9.192975044250488, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.52414321899414, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75390625, + "step": 1510, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0025320053100586 + }, + { + "episode": 24192, + "epoch": 0.14494733436387822, + "loss/policy_avg": 0.24003642797470093, + "lr": 9.034381390593047e-06, + "objective/entropy": -283.571533203125, + "objective/kl": 32.6296501159668, + "objective/non_score_reward": -1.6314826011657715, + "objective/rlhf_reward": -6.525930047035217, + "objective/scores": 0.0, + "policy/approxkl_avg": 41.62873077392578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.650390625, + "step": 1511, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9983458518981934 + }, + { + "episode": 24208, + "epoch": 0.14504319900300774, + "loss/policy_avg": -0.3894387483596802, + "lr": 9.033742331288344e-06, + "objective/entropy": -186.86566162109375, + "objective/kl": 39.883148193359375, + "objective/non_score_reward": -1.9941574335098267, + "objective/rlhf_reward": -6.460857951434788, + "objective/scores": 0.37894294565112985, + "policy/approxkl_avg": 43.1141357421875, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.78515625, + "step": 1512, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.000645637512207 + }, + { + "episode": 24224, + "epoch": 0.1451390636421373, + "loss/policy_avg": 0.14439237117767334, + "lr": 9.033103271983641e-06, + "objective/entropy": -199.6478271484375, + "objective/kl": 44.08855438232422, + "objective/non_score_reward": -2.204427719116211, + "objective/rlhf_reward": -4.417711234092712, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.147274017333984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.95703125, + "step": 1513, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9965183734893799 + }, + { + "episode": 24240, + "epoch": 0.14523492828126686, + "loss/policy_avg": 0.3906131386756897, + "lr": 9.032464212678938e-06, + "objective/entropy": -247.7611083984375, + "objective/kl": 38.82661056518555, + "objective/non_score_reward": -1.9413304328918457, + "objective/rlhf_reward": -7.765321850776672, + "objective/scores": 0.0, + "policy/approxkl_avg": 33.52732467651367, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65625, + "step": 1514, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988460540771484 + }, + { + "episode": 24256, + "epoch": 0.1453307929203964, + "loss/policy_avg": -0.08846022188663483, + "lr": 9.031825153374233e-06, + "objective/entropy": -244.41131591796875, + "objective/kl": 22.494068145751953, + "objective/non_score_reward": -1.1247034072875977, + "objective/rlhf_reward": -4.498813331127167, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.7829079627990723, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.787109375, + "step": 1515, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001941204071045 + }, + { + "episode": 24272, + "epoch": 0.14542665755952594, + "loss/policy_avg": 0.029215388000011444, + "lr": 9.03118609406953e-06, + "objective/entropy": -164.27078247070312, + "objective/kl": 32.41437530517578, + "objective/non_score_reward": -1.6207189559936523, + "objective/rlhf_reward": -5.104273297873837, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 6.219906806945801, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.75, + "step": 1516, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999031901359558 + }, + { + "episode": 24288, + "epoch": 0.1455225221986555, + "loss/policy_avg": 0.32182592153549194, + "lr": 9.030547034764827e-06, + "objective/entropy": -232.5615234375, + "objective/kl": 40.81670379638672, + "objective/non_score_reward": -2.04083514213562, + "objective/rlhf_reward": -10.16334056854248, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.28282165527344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7109375, + "step": 1517, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979705810546875 + }, + { + "episode": 24304, + "epoch": 0.14561838683778505, + "loss/policy_avg": 0.34956806898117065, + "lr": 9.029907975460124e-06, + "objective/entropy": -245.35293579101562, + "objective/kl": 40.005531311035156, + "objective/non_score_reward": -2.000276565551758, + "objective/rlhf_reward": -8.001105904579163, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.326057434082031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.861328125, + "step": 1518, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9990335702896118 + }, + { + "episode": 24320, + "epoch": 0.1457142514769146, + "loss/policy_avg": 0.14802329242229462, + "lr": 9.02926891615542e-06, + "objective/entropy": -252.0884246826172, + "objective/kl": 30.0086669921875, + "objective/non_score_reward": -1.5004332065582275, + "objective/rlhf_reward": -6.001733183860779, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0877797603607178, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 1519, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9981493949890137 + }, + { + "episode": 24336, + "epoch": 0.14581011611604414, + "loss/policy_avg": -0.16504064202308655, + "lr": 9.028629856850718e-06, + "objective/entropy": -201.4540557861328, + "objective/kl": 34.25769805908203, + "objective/non_score_reward": -1.7128849029541016, + "objective/rlhf_reward": -6.851539373397827, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.378384113311768, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.744140625, + "step": 1520, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9997289180755615 + }, + { + "episode": 24352, + "epoch": 0.1459059807551737, + "loss/policy_avg": -0.058056317269802094, + "lr": 9.027990797546013e-06, + "objective/entropy": -247.2266845703125, + "objective/kl": 26.50177574157715, + "objective/non_score_reward": -1.3250887393951416, + "objective/rlhf_reward": -5.300354838371277, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.478802680969238, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.775390625, + "step": 1521, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.0078001022338867 + }, + { + "episode": 24368, + "epoch": 0.14600184539430325, + "loss/policy_avg": 0.31527331471443176, + "lr": 9.02735173824131e-06, + "objective/entropy": -244.34706115722656, + "objective/kl": 20.598609924316406, + "objective/non_score_reward": -1.029930591583252, + "objective/rlhf_reward": -4.119722306728363, + "objective/scores": 0.0, + "policy/approxkl_avg": 13.034455299377441, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736328125, + "step": 1522, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993641376495361 + }, + { + "episode": 24384, + "epoch": 0.1460977100334328, + "loss/policy_avg": 0.3819689154624939, + "lr": 9.026712678936605e-06, + "objective/entropy": -289.578125, + "objective/kl": 29.244293212890625, + "objective/non_score_reward": -1.462214708328247, + "objective/rlhf_reward": -7.848858833312988, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.830896377563477, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.787109375, + "step": 1523, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.001002311706543 + }, + { + "episode": 24400, + "epoch": 0.14619357467256233, + "loss/policy_avg": 0.1631467640399933, + "lr": 9.026073619631902e-06, + "objective/entropy": -172.63995361328125, + "objective/kl": 38.19480514526367, + "objective/non_score_reward": -1.9097402095794678, + "objective/rlhf_reward": -5.905627862612406, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 17.047449111938477, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.603515625, + "step": 1524, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9992611408233643 + }, + { + "episode": 24416, + "epoch": 0.1462894393116919, + "loss/policy_avg": 0.02400306612253189, + "lr": 9.025434560327199e-06, + "objective/entropy": -187.46742248535156, + "objective/kl": 33.727073669433594, + "objective/non_score_reward": -1.6863539218902588, + "objective/rlhf_reward": -6.745415568351746, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.8205434083938599, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 1525, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0029382705688477 + }, + { + "episode": 24432, + "epoch": 0.14638530395082144, + "loss/policy_avg": -0.04435855150222778, + "lr": 9.024795501022496e-06, + "objective/entropy": -192.6316375732422, + "objective/kl": 35.694820404052734, + "objective/non_score_reward": -1.7847410440444946, + "objective/rlhf_reward": -5.828283181390167, + "objective/scores": 0.327670248696953, + "policy/approxkl_avg": 4.981025218963623, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.63671875, + "step": 1526, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999833345413208 + }, + { + "episode": 24448, + "epoch": 0.146481168589951, + "loss/policy_avg": 0.34963393211364746, + "lr": 9.024156441717792e-06, + "objective/entropy": -228.53794860839844, + "objective/kl": 33.19548034667969, + "objective/non_score_reward": -1.6597739458084106, + "objective/rlhf_reward": -6.639095783233643, + "objective/scores": 0.0, + "policy/approxkl_avg": 26.755781173706055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 1527, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998466968536377 + }, + { + "episode": 24464, + "epoch": 0.14657703322908053, + "loss/policy_avg": 0.36651045083999634, + "lr": 9.023517382413088e-06, + "objective/entropy": -231.26055908203125, + "objective/kl": 41.545196533203125, + "objective/non_score_reward": -2.0772600173950195, + "objective/rlhf_reward": -5.909039950370788, + "objective/scores": 0.6, + "policy/approxkl_avg": 4.609235763549805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 1528, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9972584247589111 + }, + { + "episode": 24480, + "epoch": 0.14667289786821008, + "loss/policy_avg": -0.017867907881736755, + "lr": 9.022878323108385e-06, + "objective/entropy": -229.64715576171875, + "objective/kl": 41.77667999267578, + "objective/non_score_reward": -2.088833808898926, + "objective/rlhf_reward": -6.4079244834946945, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.239856481552124, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7109375, + "step": 1529, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 12, + "val/ratio": 2.0003726482391357 + }, + { + "episode": 24496, + "epoch": 0.14676876250733964, + "loss/policy_avg": 0.09556471556425095, + "lr": 9.022239263803681e-06, + "objective/entropy": -199.2066650390625, + "objective/kl": 37.729129791259766, + "objective/non_score_reward": -1.8864563703536987, + "objective/rlhf_reward": -7.545825362205505, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8835806846618652, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.771484375, + "step": 1530, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 12, + "val/ratio": 2.000250816345215 + }, + { + "episode": 24512, + "epoch": 0.1468646271464692, + "loss/policy_avg": 0.7275862693786621, + "lr": 9.021600204498978e-06, + "objective/entropy": -211.24411010742188, + "objective/kl": 32.49317932128906, + "objective/non_score_reward": -1.6246588230133057, + "objective/rlhf_reward": -6.498635530471802, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.31473159790039, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5859375, + "step": 1531, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.998075246810913 + }, + { + "episode": 24528, + "epoch": 0.14696049178559872, + "loss/policy_avg": 0.20322918891906738, + "lr": 9.020961145194275e-06, + "objective/entropy": -249.6521453857422, + "objective/kl": 25.734882354736328, + "objective/non_score_reward": -1.2867441177368164, + "objective/rlhf_reward": -5.146976351737976, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7096033096313477, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.697265625, + "step": 1532, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9980659484863281 + }, + { + "episode": 24544, + "epoch": 0.14705635642472828, + "loss/policy_avg": 0.11807098984718323, + "lr": 9.020322085889572e-06, + "objective/entropy": -240.2010955810547, + "objective/kl": 41.3785400390625, + "objective/non_score_reward": -2.0689268112182617, + "objective/rlhf_reward": -8.275707483291626, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5070881843566895, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.748046875, + "step": 1533, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 13, + "val/ratio": 1.9993237257003784 + }, + { + "episode": 24560, + "epoch": 0.14715222106385784, + "loss/policy_avg": -0.06808263808488846, + "lr": 9.019683026584867e-06, + "objective/entropy": -251.21754455566406, + "objective/kl": 28.882801055908203, + "objective/non_score_reward": -1.4441399574279785, + "objective/rlhf_reward": -7.776559829711914, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.311460018157959, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.783203125, + "step": 1534, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9993529319763184 + }, + { + "episode": 24576, + "epoch": 0.1472480857029874, + "loss/policy_avg": 0.9078904390335083, + "lr": 9.019043967280164e-06, + "objective/entropy": -225.9461669921875, + "objective/kl": 28.754737854003906, + "objective/non_score_reward": -1.437736988067627, + "objective/rlhf_reward": -7.750947952270508, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.7066650390625, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.662109375, + "step": 1535, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992409944534302 + }, + { + "episode": 24592, + "epoch": 0.14734395034211692, + "loss/policy_avg": 0.877597987651825, + "lr": 9.018404907975461e-06, + "objective/entropy": -137.12261962890625, + "objective/kl": 45.62024688720703, + "objective/non_score_reward": -2.2810122966766357, + "objective/rlhf_reward": -9.124049186706543, + "objective/scores": 0.0, + "policy/approxkl_avg": 20.099409103393555, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.791015625, + "step": 1536, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988036155700684 + }, + { + "episode": 24608, + "epoch": 0.14743981498124648, + "loss/policy_avg": 0.17912150919437408, + "lr": 9.017765848670758e-06, + "objective/entropy": -277.5224914550781, + "objective/kl": 28.473520278930664, + "objective/non_score_reward": -1.4236760139465332, + "objective/rlhf_reward": -4.032844667852507, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 3.1386327743530273, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69140625, + "step": 1537, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9997999668121338 + }, + { + "episode": 24624, + "epoch": 0.14753567962037603, + "loss/policy_avg": 0.3780398964881897, + "lr": 9.017126789366055e-06, + "objective/entropy": -267.19219970703125, + "objective/kl": 38.658103942871094, + "objective/non_score_reward": -1.9329053163528442, + "objective/rlhf_reward": -6.175361721721247, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 10.717710494995117, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.802734375, + "step": 1538, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.997018575668335 + }, + { + "episode": 24640, + "epoch": 0.1476315442595056, + "loss/policy_avg": -0.08956390619277954, + "lr": 9.01648773006135e-06, + "objective/entropy": -217.35922241210938, + "objective/kl": 31.325536727905273, + "objective/non_score_reward": -1.566277027130127, + "objective/rlhf_reward": -4.317696641163762, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.292875289916992, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.939453125, + "step": 1539, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000589609146118 + }, + { + "episode": 24656, + "epoch": 0.14772740889863512, + "loss/policy_avg": 0.014240334741771221, + "lr": 9.015848670756647e-06, + "objective/entropy": -204.67739868164062, + "objective/kl": 37.74090576171875, + "objective/non_score_reward": -1.88704514503479, + "objective/rlhf_reward": -7.548180818557739, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.760732650756836, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.890625, + "step": 1540, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.0004096031188965 + }, + { + "episode": 24672, + "epoch": 0.14782327353776467, + "loss/policy_avg": 0.025121957063674927, + "lr": 9.015209611451944e-06, + "objective/entropy": -180.15380859375, + "objective/kl": 46.799381256103516, + "objective/non_score_reward": -2.3399691581726074, + "objective/rlhf_reward": -11.35987663269043, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.214326858520508, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.798828125, + "step": 1541, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9986114501953125 + }, + { + "episode": 24688, + "epoch": 0.14791913817689423, + "loss/policy_avg": 0.3259159326553345, + "lr": 9.01457055214724e-06, + "objective/entropy": -180.61639404296875, + "objective/kl": 41.78929901123047, + "objective/non_score_reward": -2.0894649028778076, + "objective/rlhf_reward": -8.35785961151123, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.7718939781188965, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62109375, + "step": 1542, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9963030815124512 + }, + { + "episode": 24704, + "epoch": 0.14801500281602378, + "loss/policy_avg": 0.07943452894687653, + "lr": 9.013931492842537e-06, + "objective/entropy": -206.35743713378906, + "objective/kl": 33.90462112426758, + "objective/non_score_reward": -1.6952309608459473, + "objective/rlhf_reward": -6.780924081802368, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.5792279243469238, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6171875, + "step": 1543, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0008997917175293 + }, + { + "episode": 24720, + "epoch": 0.1481108674551533, + "loss/policy_avg": 0.25776898860931396, + "lr": 9.013292433537834e-06, + "objective/entropy": -221.25030517578125, + "objective/kl": 30.62183952331543, + "objective/non_score_reward": -1.5310920476913452, + "objective/rlhf_reward": -6.124368071556091, + "objective/scores": 0.0, + "policy/approxkl_avg": 76.88352966308594, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.810546875, + "step": 1544, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 12, + "val/ratio": 1.998036503791809 + }, + { + "episode": 24736, + "epoch": 0.14820673209428287, + "loss/policy_avg": 0.1526581048965454, + "lr": 9.01265337423313e-06, + "objective/entropy": -274.7589111328125, + "objective/kl": 40.24427032470703, + "objective/non_score_reward": -2.0122134685516357, + "objective/rlhf_reward": -5.648853754997253, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.6714682579040527, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.724609375, + "step": 1545, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9986469745635986 + }, + { + "episode": 24752, + "epoch": 0.14830259673341242, + "loss/policy_avg": -0.08985395729541779, + "lr": 9.012014314928426e-06, + "objective/entropy": -252.19927978515625, + "objective/kl": 39.69486999511719, + "objective/non_score_reward": -1.984743595123291, + "objective/rlhf_reward": -6.334854397837239, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 3.1292824745178223, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.791015625, + "step": 1546, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9977748394012451 + }, + { + "episode": 24768, + "epoch": 0.14839846137254198, + "loss/policy_avg": 0.1944682002067566, + "lr": 9.011375255623722e-06, + "objective/entropy": -162.96212768554688, + "objective/kl": 37.579322814941406, + "objective/non_score_reward": -1.878966212272644, + "objective/rlhf_reward": -7.515864968299866, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0543575286865234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.626953125, + "step": 1547, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9988646507263184 + }, + { + "episode": 24784, + "epoch": 0.1484943260116715, + "loss/policy_avg": 0.1287703514099121, + "lr": 9.010736196319018e-06, + "objective/entropy": -243.0048065185547, + "objective/kl": 26.16973114013672, + "objective/non_score_reward": -1.3084864616394043, + "objective/rlhf_reward": -7.233946323394775, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.313716888427734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.740234375, + "step": 1548, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9991748332977295 + }, + { + "episode": 24800, + "epoch": 0.14859019065080106, + "loss/policy_avg": 0.03606206923723221, + "lr": 9.010097137014315e-06, + "objective/entropy": -137.3321533203125, + "objective/kl": 47.2755126953125, + "objective/non_score_reward": -2.3637757301330566, + "objective/rlhf_reward": -9.455102682113647, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.434524536132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59375, + "step": 1549, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999967336654663 + }, + { + "episode": 24816, + "epoch": 0.14868605528993062, + "loss/policy_avg": 0.6820394396781921, + "lr": 9.009458077709612e-06, + "objective/entropy": -259.4637451171875, + "objective/kl": 39.200157165527344, + "objective/non_score_reward": -1.960007905960083, + "objective/rlhf_reward": -7.840031385421753, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.499387502670288, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.75, + "step": 1550, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9985129833221436 + }, + { + "episode": 24832, + "epoch": 0.14878191992906017, + "loss/policy_avg": 0.3245728015899658, + "lr": 9.008819018404909e-06, + "objective/entropy": -266.9729919433594, + "objective/kl": 44.531394958496094, + "objective/non_score_reward": -2.226569890975952, + "objective/rlhf_reward": -8.906279563903809, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.8533454537391663, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.63671875, + "step": 1551, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0011708736419678 + }, + { + "episode": 24848, + "epoch": 0.1488777845681897, + "loss/policy_avg": 0.1209598034620285, + "lr": 9.008179959100204e-06, + "objective/entropy": -200.48370361328125, + "objective/kl": 39.46515655517578, + "objective/non_score_reward": -1.9732578992843628, + "objective/rlhf_reward": -7.893031597137451, + "objective/scores": 0.0, + "policy/approxkl_avg": 79.04573059082031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.54296875, + "step": 1552, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9970680475234985 + }, + { + "episode": 24864, + "epoch": 0.14897364920731926, + "loss/policy_avg": -0.16851302981376648, + "lr": 9.007540899795501e-06, + "objective/entropy": -206.54364013671875, + "objective/kl": 39.65662384033203, + "objective/non_score_reward": -1.9828312397003174, + "objective/rlhf_reward": -7.931325197219849, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.362776756286621, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.771484375, + "step": 1553, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001493453979492 + }, + { + "episode": 24880, + "epoch": 0.14906951384644881, + "loss/policy_avg": 0.13970156013965607, + "lr": 9.006901840490798e-06, + "objective/entropy": -254.33328247070312, + "objective/kl": 35.31574249267578, + "objective/non_score_reward": -1.765787124633789, + "objective/rlhf_reward": -7.0631484389305115, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.011855125427246, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.73046875, + "step": 1554, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0034103393554688 + }, + { + "episode": 24896, + "epoch": 0.14916537848557837, + "loss/policy_avg": -0.07202109694480896, + "lr": 9.006262781186095e-06, + "objective/entropy": -272.9408264160156, + "objective/kl": 30.194461822509766, + "objective/non_score_reward": -1.5097230672836304, + "objective/rlhf_reward": -6.0388922691345215, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7257907390594482, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.71484375, + "step": 1555, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 2.001354694366455 + }, + { + "episode": 24912, + "epoch": 0.1492612431247079, + "loss/policy_avg": 0.12338749319314957, + "lr": 9.005623721881392e-06, + "objective/entropy": -257.670654296875, + "objective/kl": 32.610836029052734, + "objective/non_score_reward": -1.6305418014526367, + "objective/rlhf_reward": -6.522167325019836, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.9358582496643066, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7734375, + "step": 1556, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.998913049697876 + }, + { + "episode": 24928, + "epoch": 0.14935710776383745, + "loss/policy_avg": 0.4687105119228363, + "lr": 9.004984662576689e-06, + "objective/entropy": -200.3157958984375, + "objective/kl": 56.399009704589844, + "objective/non_score_reward": -2.819950580596924, + "objective/rlhf_reward": -11.279801845550537, + "objective/scores": 0.0, + "policy/approxkl_avg": 29.938013076782227, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.822265625, + "step": 1557, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996778964996338 + }, + { + "episode": 24944, + "epoch": 0.149452972402967, + "loss/policy_avg": 0.06978608667850494, + "lr": 9.004345603271984e-06, + "objective/entropy": -221.17617797851562, + "objective/kl": 43.83458709716797, + "objective/non_score_reward": -2.1917290687561035, + "objective/rlhf_reward": -7.162796769205647, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 30.487266540527344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.775390625, + "step": 1558, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.00136137008667 + }, + { + "episode": 24960, + "epoch": 0.14954883704209657, + "loss/policy_avg": 0.0826239138841629, + "lr": 9.00370654396728e-06, + "objective/entropy": -212.47731018066406, + "objective/kl": 31.849992752075195, + "objective/non_score_reward": -1.5924997329711914, + "objective/rlhf_reward": -6.369998931884766, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1835784912109375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7421875, + "step": 1559, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9997361898422241 + }, + { + "episode": 24976, + "epoch": 0.14964470168122612, + "loss/policy_avg": -0.1567002236843109, + "lr": 9.003067484662578e-06, + "objective/entropy": -228.3983154296875, + "objective/kl": 40.29692077636719, + "objective/non_score_reward": -2.014846086502075, + "objective/rlhf_reward": -10.0593843460083, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.873206377029419, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.798828125, + "step": 1560, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0014376640319824 + }, + { + "episode": 24992, + "epoch": 0.14974056632035565, + "loss/policy_avg": 0.011781550943851471, + "lr": 9.002428425357874e-06, + "objective/entropy": -164.55712890625, + "objective/kl": 29.503942489624023, + "objective/non_score_reward": -1.475197196006775, + "objective/rlhf_reward": -7.9007887840271, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.115288496017456, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.955078125, + "step": 1561, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.001311779022217 + }, + { + "episode": 25008, + "epoch": 0.1498364309594852, + "loss/policy_avg": 1.3479280471801758, + "lr": 9.001789366053171e-06, + "objective/entropy": -284.59918212890625, + "objective/kl": 27.005859375, + "objective/non_score_reward": -1.3502930402755737, + "objective/rlhf_reward": -5.401172041893005, + "objective/scores": 0.0, + "policy/approxkl_avg": 12.699869155883789, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.748046875, + "step": 1562, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994478225708008 + }, + { + "episode": 25024, + "epoch": 0.14993229559861476, + "loss/policy_avg": -0.07637365162372589, + "lr": 9.001150306748467e-06, + "objective/entropy": -239.9210662841797, + "objective/kl": 37.84478759765625, + "objective/non_score_reward": -1.8922393321990967, + "objective/rlhf_reward": -7.568957090377808, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7365081310272217, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.787109375, + "step": 1563, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.999384880065918 + }, + { + "episode": 25040, + "epoch": 0.15002816023774432, + "loss/policy_avg": 0.2635817229747772, + "lr": 9.000511247443763e-06, + "objective/entropy": -271.03594970703125, + "objective/kl": 43.677249908447266, + "objective/non_score_reward": -2.1838626861572266, + "objective/rlhf_reward": -8.735450506210327, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7986581325531006, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.734375, + "step": 1564, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.997469425201416 + }, + { + "episode": 25056, + "epoch": 0.15012402487687385, + "loss/policy_avg": -0.19829396903514862, + "lr": 8.99987218813906e-06, + "objective/entropy": -247.4576416015625, + "objective/kl": 44.31961441040039, + "objective/non_score_reward": -2.2159807682037354, + "objective/rlhf_reward": -8.863922953605652, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.949400901794434, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.72265625, + "step": 1565, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9998189210891724 + }, + { + "episode": 25072, + "epoch": 0.1502198895160034, + "loss/policy_avg": 0.22578392922878265, + "lr": 8.999233128834357e-06, + "objective/entropy": -273.71771240234375, + "objective/kl": 39.191856384277344, + "objective/non_score_reward": -1.9595928192138672, + "objective/rlhf_reward": -5.438371396064758, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.645939350128174, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6953125, + "step": 1566, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9982613325119019 + }, + { + "episode": 25088, + "epoch": 0.15031575415513296, + "loss/policy_avg": 0.596666693687439, + "lr": 8.998594069529654e-06, + "objective/entropy": -228.72174072265625, + "objective/kl": 32.305564880371094, + "objective/non_score_reward": -1.6152782440185547, + "objective/rlhf_reward": -8.461112976074219, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.73217010498047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.796875, + "step": 1567, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9973573684692383 + }, + { + "episode": 25104, + "epoch": 0.1504116187942625, + "loss/policy_avg": 0.0043340930715203285, + "lr": 8.99795501022495e-06, + "objective/entropy": -276.8592529296875, + "objective/kl": 53.61993408203125, + "objective/non_score_reward": -2.680996894836426, + "objective/rlhf_reward": -8.77657551594251, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 3.031794548034668, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.74609375, + "step": 1568, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999174952507019 + }, + { + "episode": 25120, + "epoch": 0.15050748343339204, + "loss/policy_avg": -0.1500149965286255, + "lr": 8.997315950920246e-06, + "objective/entropy": -193.1001739501953, + "objective/kl": 39.159427642822266, + "objective/non_score_reward": -1.9579713344573975, + "objective/rlhf_reward": -7.83188533782959, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.881107330322266, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.568359375, + "step": 1569, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0019936561584473 + }, + { + "episode": 25136, + "epoch": 0.1506033480725216, + "loss/policy_avg": 0.5354205369949341, + "lr": 8.996676891615543e-06, + "objective/entropy": -217.83689880371094, + "objective/kl": 38.3505744934082, + "objective/non_score_reward": -1.917528748512268, + "objective/rlhf_reward": -7.670114994049072, + "objective/scores": 0.0, + "policy/approxkl_avg": 65.86680603027344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.72265625, + "step": 1570, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9978712797164917 + }, + { + "episode": 25152, + "epoch": 0.15069921271165115, + "loss/policy_avg": -0.17549648880958557, + "lr": 8.996037832310838e-06, + "objective/entropy": -250.0916290283203, + "objective/kl": 37.26240539550781, + "objective/non_score_reward": -1.8631203174591064, + "objective/rlhf_reward": -7.452481389045715, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.422430992126465, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.734375, + "step": 1571, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.001981019973755 + }, + { + "episode": 25168, + "epoch": 0.1507950773507807, + "loss/policy_avg": 0.2184741199016571, + "lr": 8.995398773006135e-06, + "objective/entropy": -179.99691772460938, + "objective/kl": 23.3301944732666, + "objective/non_score_reward": -1.166509747505188, + "objective/rlhf_reward": -6.666038990020752, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.243851900100708, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.86328125, + "step": 1572, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0004656314849854 + }, + { + "episode": 25184, + "epoch": 0.15089094198991024, + "loss/policy_avg": 0.06072113662958145, + "lr": 8.994759713701432e-06, + "objective/entropy": -265.5483703613281, + "objective/kl": 42.41337585449219, + "objective/non_score_reward": -2.120668888092041, + "objective/rlhf_reward": -10.482675552368164, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.57902193069458, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6640625, + "step": 1573, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0009164810180664 + }, + { + "episode": 25200, + "epoch": 0.1509868066290398, + "loss/policy_avg": 0.24894866347312927, + "lr": 8.994120654396729e-06, + "objective/entropy": -138.1594696044922, + "objective/kl": 28.901180267333984, + "objective/non_score_reward": -1.445059061050415, + "objective/rlhf_reward": -4.4386002331072385, + "objective/scores": 0.33540891336663825, + "policy/approxkl_avg": 2.6422810554504395, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.767578125, + "step": 1574, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9973914623260498 + }, + { + "episode": 25216, + "epoch": 0.15108267126816935, + "loss/policy_avg": 0.028879959136247635, + "lr": 8.993481595092026e-06, + "objective/entropy": -241.52708435058594, + "objective/kl": 33.4459114074707, + "objective/non_score_reward": -1.6722955703735352, + "objective/rlhf_reward": -6.689182162284851, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.7367143630981445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.791015625, + "step": 1575, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 2.0001864433288574 + }, + { + "episode": 25232, + "epoch": 0.1511785359072989, + "loss/policy_avg": 0.02605835348367691, + "lr": 8.992842535787321e-06, + "objective/entropy": -228.37533569335938, + "objective/kl": 35.62575912475586, + "objective/non_score_reward": -1.7812879085540771, + "objective/rlhf_reward": -7.125151515007019, + "objective/scores": 0.0, + "policy/approxkl_avg": 16.881288528442383, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7421875, + "step": 1576, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9996086359024048 + }, + { + "episode": 25248, + "epoch": 0.15127440054642843, + "loss/policy_avg": 0.04691431671380997, + "lr": 8.992203476482618e-06, + "objective/entropy": -259.7035827636719, + "objective/kl": 37.510040283203125, + "objective/non_score_reward": -1.8755018711090088, + "objective/rlhf_reward": -9.502008438110352, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.13444709777832, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62109375, + "step": 1577, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0005037784576416 + }, + { + "episode": 25264, + "epoch": 0.151370265185558, + "loss/policy_avg": 0.2744201123714447, + "lr": 8.991564417177915e-06, + "objective/entropy": -199.12237548828125, + "objective/kl": 33.50159454345703, + "objective/non_score_reward": -1.6750797033309937, + "objective/rlhf_reward": -3.7765999182474346, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 46.6461181640625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 1578, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997525930404663 + }, + { + "episode": 25280, + "epoch": 0.15146612982468755, + "loss/policy_avg": 0.025500038638710976, + "lr": 8.990925357873212e-06, + "objective/entropy": -201.05966186523438, + "objective/kl": 45.26744842529297, + "objective/non_score_reward": -2.2633724212646484, + "objective/rlhf_reward": -7.106078098492558, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.965685844421387, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.810546875, + "step": 1579, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9994701147079468 + }, + { + "episode": 25296, + "epoch": 0.1515619944638171, + "loss/policy_avg": 0.11785895377397537, + "lr": 8.990286298568508e-06, + "objective/entropy": -213.6715087890625, + "objective/kl": 35.12263488769531, + "objective/non_score_reward": -1.756131649017334, + "objective/rlhf_reward": -4.901820363775764, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.115727186203003, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.849609375, + "step": 1580, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9985606670379639 + }, + { + "episode": 25312, + "epoch": 0.15165785910294663, + "loss/policy_avg": 1.1425952911376953, + "lr": 8.989647239263805e-06, + "objective/entropy": -245.26690673828125, + "objective/kl": 34.19249725341797, + "objective/non_score_reward": -1.709625005722046, + "objective/rlhf_reward": -5.105166331926981, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 116.5212631225586, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.591796875, + "step": 1581, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9963122606277466 + }, + { + "episode": 25328, + "epoch": 0.15175372374207619, + "loss/policy_avg": 0.0455983430147171, + "lr": 8.9890081799591e-06, + "objective/entropy": -242.7713165283203, + "objective/kl": 30.195276260375977, + "objective/non_score_reward": -1.5097639560699463, + "objective/rlhf_reward": -6.0390554666519165, + "objective/scores": 0.0, + "policy/approxkl_avg": 26.28820037841797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.74609375, + "step": 1582, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 12, + "val/ratio": 2.0006484985351562 + }, + { + "episode": 25344, + "epoch": 0.15184958838120574, + "loss/policy_avg": -0.2273516207933426, + "lr": 8.988369120654397e-06, + "objective/entropy": -255.30850219726562, + "objective/kl": 35.210968017578125, + "objective/non_score_reward": -1.7605485916137695, + "objective/rlhf_reward": -5.380334620893585, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 3.561713695526123, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.720703125, + "step": 1583, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9990644454956055 + }, + { + "episode": 25360, + "epoch": 0.1519454530203353, + "loss/policy_avg": 0.09360451996326447, + "lr": 8.987730061349694e-06, + "objective/entropy": -259.33392333984375, + "objective/kl": 35.73432159423828, + "objective/non_score_reward": -1.7867159843444824, + "objective/rlhf_reward": -7.146864175796509, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6448609828948975, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.693359375, + "step": 1584, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9977494478225708 + }, + { + "episode": 25376, + "epoch": 0.15204131765946483, + "loss/policy_avg": 0.39302462339401245, + "lr": 8.987091002044991e-06, + "objective/entropy": -240.88673400878906, + "objective/kl": 26.642396926879883, + "objective/non_score_reward": -1.3321198225021362, + "objective/rlhf_reward": -3.9498770023263514, + "objective/scores": 0.34465054211822604, + "policy/approxkl_avg": 9.123327255249023, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.68359375, + "step": 1585, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9998672008514404 + }, + { + "episode": 25392, + "epoch": 0.15213718229859438, + "loss/policy_avg": 0.19749504327774048, + "lr": 8.986451942740288e-06, + "objective/entropy": -219.55767822265625, + "objective/kl": 42.86522674560547, + "objective/non_score_reward": -2.143261432647705, + "objective/rlhf_reward": -8.573045372962952, + "objective/scores": 0.0, + "policy/approxkl_avg": 17.275981903076172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.744140625, + "step": 1586, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0001418590545654 + }, + { + "episode": 25408, + "epoch": 0.15223304693772394, + "loss/policy_avg": -0.056185394525527954, + "lr": 8.985812883435585e-06, + "objective/entropy": -201.51565551757812, + "objective/kl": 32.736846923828125, + "objective/non_score_reward": -1.6368424892425537, + "objective/rlhf_reward": -6.547369956970215, + "objective/scores": 0.0, + "policy/approxkl_avg": 20.205280303955078, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.72265625, + "step": 1587, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000234842300415 + }, + { + "episode": 25424, + "epoch": 0.1523289115768535, + "loss/policy_avg": 0.24753238260746002, + "lr": 8.98517382413088e-06, + "objective/entropy": -221.9798126220703, + "objective/kl": 45.060585021972656, + "objective/non_score_reward": -2.2530293464660645, + "objective/rlhf_reward": -9.012117326259613, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.459669828414917, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.634765625, + "step": 1588, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9992806911468506 + }, + { + "episode": 25440, + "epoch": 0.15242477621598302, + "loss/policy_avg": 0.6944048404693604, + "lr": 8.984534764826177e-06, + "objective/entropy": -188.84034729003906, + "objective/kl": 35.540496826171875, + "objective/non_score_reward": -1.777024745941162, + "objective/rlhf_reward": -2.708098983764648, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.140187740325928, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.65625, + "step": 1589, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9987516403198242 + }, + { + "episode": 25456, + "epoch": 0.15252064085511258, + "loss/policy_avg": -0.03955763578414917, + "lr": 8.983895705521472e-06, + "objective/entropy": -240.50555419921875, + "objective/kl": 30.987028121948242, + "objective/non_score_reward": -1.5493513345718384, + "objective/rlhf_reward": -8.197404861450195, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.65213394165039, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.626953125, + "step": 1590, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0015077590942383 + }, + { + "episode": 25472, + "epoch": 0.15261650549424213, + "loss/policy_avg": 0.35600098967552185, + "lr": 8.983256646216769e-06, + "objective/entropy": -232.04159545898438, + "objective/kl": 35.47593688964844, + "objective/non_score_reward": -1.7737970352172852, + "objective/rlhf_reward": -7.09518837928772, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.107233762741089, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.720703125, + "step": 1591, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9998784065246582 + }, + { + "episode": 25488, + "epoch": 0.1527123701333717, + "loss/policy_avg": 0.07229488343000412, + "lr": 8.982617586912066e-06, + "objective/entropy": -131.9998779296875, + "objective/kl": 34.63762664794922, + "objective/non_score_reward": -1.7318813800811768, + "objective/rlhf_reward": -6.927525281906128, + "objective/scores": 0.0, + "policy/approxkl_avg": 30.663389205932617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.798828125, + "step": 1592, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9991148710250854 + }, + { + "episode": 25504, + "epoch": 0.15280823477250122, + "loss/policy_avg": 0.3919386565685272, + "lr": 8.981978527607363e-06, + "objective/entropy": -183.88949584960938, + "objective/kl": 45.78101348876953, + "objective/non_score_reward": -2.289051055908203, + "objective/rlhf_reward": -9.156203627586365, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.686834335327148, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.67578125, + "step": 1593, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9997835159301758 + }, + { + "episode": 25520, + "epoch": 0.15290409941163077, + "loss/policy_avg": 0.4673955738544464, + "lr": 8.98133946830266e-06, + "objective/entropy": -133.40164184570312, + "objective/kl": 39.26948547363281, + "objective/non_score_reward": -1.963474154472351, + "objective/rlhf_reward": -7.853896498680115, + "objective/scores": 0.0, + "policy/approxkl_avg": 32.20845413208008, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.55078125, + "step": 1594, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9985507726669312 + }, + { + "episode": 25536, + "epoch": 0.15299996405076033, + "loss/policy_avg": 0.17131832242012024, + "lr": 8.980700408997955e-06, + "objective/entropy": -115.58807373046875, + "objective/kl": 42.959449768066406, + "objective/non_score_reward": -2.147972345352173, + "objective/rlhf_reward": -8.59188961982727, + "objective/scores": 0.0, + "policy/approxkl_avg": 51.266475677490234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.779296875, + "step": 1595, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990386962890625 + }, + { + "episode": 25552, + "epoch": 0.15309582868988988, + "loss/policy_avg": 0.10576220601797104, + "lr": 8.980061349693252e-06, + "objective/entropy": -187.7340087890625, + "objective/kl": 37.933746337890625, + "objective/non_score_reward": -1.8966872692108154, + "objective/rlhf_reward": -7.586748719215393, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.019711494445801, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.802734375, + "step": 1596, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9997637271881104 + }, + { + "episode": 25568, + "epoch": 0.1531916933290194, + "loss/policy_avg": -0.0002389177680015564, + "lr": 8.979422290388549e-06, + "objective/entropy": -34.37802505493164, + "objective/kl": 53.69154357910156, + "objective/non_score_reward": -2.684577226638794, + "objective/rlhf_reward": -12.73830795288086, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.00918436050415, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.796875, + "step": 1597, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998850703239441 + }, + { + "episode": 25584, + "epoch": 0.15328755796814897, + "loss/policy_avg": 0.05785757303237915, + "lr": 8.978783231083845e-06, + "objective/entropy": -236.28366088867188, + "objective/kl": 37.61219024658203, + "objective/non_score_reward": -1.8806097507476807, + "objective/rlhf_reward": -7.522438883781433, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.319062232971191, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.69921875, + "step": 1598, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981284141540527 + }, + { + "episode": 25600, + "epoch": 0.15338342260727852, + "loss/policy_avg": 0.11516030132770538, + "lr": 8.978144171779142e-06, + "objective/entropy": -233.5134735107422, + "objective/kl": 30.138248443603516, + "objective/non_score_reward": -1.5069124698638916, + "objective/rlhf_reward": -8.027649879455566, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.9873509407043457, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.619140625, + "step": 1599, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000929832458496 + } + ], + "logging_steps": 500, + "max_steps": 7824, + "num_input_tokens_seen": 0, + "num_train_epochs": 3.0, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0, + "train_batch_size": null, + "trial_name": null, + "trial_params": null +}