diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,18034 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "episode": 16000, + "epoch": 0.28759391738864726, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "episode": 16, + "epoch": 0.00028759391738864725, + "loss/policy_avg": -0.014177359640598297, + "lr": 3e-06, + "objective/entropy": 119.65733337402344, + "objective/kl": 15.623376846313477, + "objective/non_score_reward": -1.5623377561569214, + "objective/rlhf_reward": -3.325632084847662, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 472.72821044921875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7515315413475037, + "step": 0, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000829696655273 + }, + { + "episode": 32, + "epoch": 0.0005751878347772945, + "loss/policy_avg": 0.05164449289441109, + "lr": 2.999808282208589e-06, + "objective/entropy": -117.60435485839844, + "objective/kl": 11.686213493347168, + "objective/non_score_reward": -1.168621301651001, + "objective/rlhf_reward": -4.274485094845295, + "objective/scores": 0.1, + "policy/approxkl_avg": 236.72177124023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6307989358901978, + "step": 1, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973212480545044 + }, + { + "episode": 48, + "epoch": 0.0008627817521659417, + "loss/policy_avg": 0.6165977120399475, + "lr": 2.999616564417178e-06, + "objective/entropy": -116.07769775390625, + "objective/kl": 10.806825637817383, + "objective/non_score_reward": -1.080682635307312, + "objective/rlhf_reward": -3.922730395942926, + "objective/scores": 0.1, + "policy/approxkl_avg": 211.7506103515625, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.726571798324585, + "step": 2, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0005264282226562 + }, + { + "episode": 64, + "epoch": 0.001150375669554589, + "loss/policy_avg": 0.39946672320365906, + "lr": 2.999424846625767e-06, + "objective/entropy": -284.77886962890625, + "objective/kl": 9.179925918579102, + "objective/non_score_reward": -0.9179927110671997, + "objective/rlhf_reward": -3.2719709336757656, + "objective/scores": 0.1, + "policy/approxkl_avg": 172.39312744140625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7219442129135132, + "step": 3, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9991655349731445 + }, + { + "episode": 80, + "epoch": 0.001437969586943236, + "loss/policy_avg": 0.18221884965896606, + "lr": 2.999233128834356e-06, + "objective/entropy": -326.7154541015625, + "objective/kl": 10.727872848510742, + "objective/non_score_reward": -1.0727872848510742, + "objective/rlhf_reward": -1.3674301027667253, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 252.6199188232422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5611602067947388, + "step": 4, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9977035522460938 + }, + { + "episode": 96, + "epoch": 0.0017255635043318834, + "loss/policy_avg": 0.37348473072052, + "lr": 2.999041411042945e-06, + "objective/entropy": -172.4725341796875, + "objective/kl": 9.580272674560547, + "objective/non_score_reward": -0.958027184009552, + "objective/rlhf_reward": -3.43210876584053, + "objective/scores": 0.1, + "policy/approxkl_avg": 233.60519409179688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6069347858428955, + "step": 5, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.004929304122925 + }, + { + "episode": 112, + "epoch": 0.0020131574217205307, + "loss/policy_avg": 0.5359442234039307, + "lr": 2.9988496932515338e-06, + "objective/entropy": 37.751182556152344, + "objective/kl": 8.995965957641602, + "objective/non_score_reward": -0.8995967507362366, + "objective/rlhf_reward": -1.6509756696986513, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 157.80946350097656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.45883142948150635, + "step": 6, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981789588928223 + }, + { + "episode": 128, + "epoch": 0.002300751339109178, + "loss/policy_avg": 0.07628901302814484, + "lr": 2.998657975460123e-06, + "objective/entropy": -271.4947509765625, + "objective/kl": 9.241050720214844, + "objective/non_score_reward": -0.9241052865982056, + "objective/rlhf_reward": 0.7035788685083393, + "objective/scores": 1.1, + "policy/approxkl_avg": 179.53875732421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6910897493362427, + "step": 7, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9963037967681885 + }, + { + "episode": 144, + "epoch": 0.002588345256497825, + "loss/policy_avg": 0.0354180671274662, + "lr": 2.998466257668712e-06, + "objective/entropy": 209.80404663085938, + "objective/kl": 11.208139419555664, + "objective/non_score_reward": -1.1208139657974243, + "objective/rlhf_reward": -2.8213962368374927, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 217.8009033203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6648087501525879, + "step": 8, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9964534044265747 + }, + { + "episode": 160, + "epoch": 0.002875939173886472, + "loss/policy_avg": 0.24756430089473724, + "lr": 2.9982745398773006e-06, + "objective/entropy": -5.9293365478515625, + "objective/kl": 1.9302005767822266, + "objective/non_score_reward": -0.1930200457572937, + "objective/rlhf_reward": 1.3506260157367849, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 22.118091583251953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6500340700149536, + "step": 9, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002328395843506 + }, + { + "episode": 176, + "epoch": 0.0031635330912751195, + "loss/policy_avg": 0.22338274121284485, + "lr": 2.99808282208589e-06, + "objective/entropy": -51.18250274658203, + "objective/kl": 4.893694877624512, + "objective/non_score_reward": -0.48936957120895386, + "objective/rlhf_reward": -3.9574780464172363, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.685855865478516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6514552235603333, + "step": 10, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002238750457764 + }, + { + "episode": 192, + "epoch": 0.0034511270086637668, + "loss/policy_avg": 0.07259142398834229, + "lr": 2.9978911042944787e-06, + "objective/entropy": -35.05317306518555, + "objective/kl": 7.698199272155762, + "objective/non_score_reward": -0.769819974899292, + "objective/rlhf_reward": 1.3207200556993488, + "objective/scores": 1.1, + "policy/approxkl_avg": 163.5728759765625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4998992681503296, + "step": 11, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982101917266846 + }, + { + "episode": 208, + "epoch": 0.003738720926052414, + "loss/policy_avg": 0.30875226855278015, + "lr": 2.9976993865030675e-06, + "objective/entropy": 128.93115234375, + "objective/kl": 8.55907154083252, + "objective/non_score_reward": -0.8559072017669678, + "objective/rlhf_reward": -3.0236288517713543, + "objective/scores": 0.1, + "policy/approxkl_avg": 115.09192657470703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6001245975494385, + "step": 12, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99881911277771 + }, + { + "episode": 224, + "epoch": 0.004026314843441061, + "loss/policy_avg": 0.5440771579742432, + "lr": 2.9975076687116563e-06, + "objective/entropy": 194.59161376953125, + "objective/kl": 14.834866523742676, + "objective/non_score_reward": -1.4834866523742676, + "objective/rlhf_reward": -1.5339468330144879, + "objective/scores": 1.1, + "policy/approxkl_avg": 319.5052185058594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.46385329961776733, + "step": 13, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979116916656494 + }, + { + "episode": 240, + "epoch": 0.004313908760829708, + "loss/policy_avg": 0.12125951051712036, + "lr": 2.9973159509202455e-06, + "objective/entropy": 110.74070739746094, + "objective/kl": 6.117404937744141, + "objective/non_score_reward": -0.6117404699325562, + "objective/rlhf_reward": -4.446961879730225, + "objective/scores": -0.5, + "policy/approxkl_avg": 54.780738830566406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6084516048431396, + "step": 14, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986605644226074 + }, + { + "episode": 256, + "epoch": 0.004601502678218356, + "loss/policy_avg": 0.2855263352394104, + "lr": 2.9971242331288343e-06, + "objective/entropy": -80.00950622558594, + "objective/kl": 8.695795059204102, + "objective/non_score_reward": -0.8695796728134155, + "objective/rlhf_reward": -1.3556124068060258, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 134.0033416748047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4744781255722046, + "step": 15, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99568772315979 + }, + { + "episode": 272, + "epoch": 0.004889096595607003, + "loss/policy_avg": 0.34731030464172363, + "lr": 2.996932515337423e-06, + "objective/entropy": -193.7414093017578, + "objective/kl": 6.8158183097839355, + "objective/non_score_reward": -0.6815819144248962, + "objective/rlhf_reward": -0.7789164585637408, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 94.75167083740234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5870293378829956, + "step": 16, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999891996383667 + }, + { + "episode": 288, + "epoch": 0.00517669051299565, + "loss/policy_avg": 0.09466144442558289, + "lr": 2.9967407975460124e-06, + "objective/entropy": 13.335285186767578, + "objective/kl": 7.405551910400391, + "objective/non_score_reward": -0.7405551671981812, + "objective/rlhf_reward": -4.962221145629883, + "objective/scores": -0.5, + "policy/approxkl_avg": 131.025146484375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4989420771598816, + "step": 17, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998366832733154 + }, + { + "episode": 304, + "epoch": 0.0054642844303842975, + "loss/policy_avg": 0.05795682966709137, + "lr": 2.996549079754601e-06, + "objective/entropy": 121.52836608886719, + "objective/kl": 9.26551628112793, + "objective/non_score_reward": -0.9265516996383667, + "objective/rlhf_reward": -3.3062067613005635, + "objective/scores": 0.1, + "policy/approxkl_avg": 123.83927917480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7211639285087585, + "step": 18, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0007009506225586 + }, + { + "episode": 320, + "epoch": 0.005751878347772944, + "loss/policy_avg": 0.33338093757629395, + "lr": 2.9963573619631904e-06, + "objective/entropy": -9.356884002685547, + "objective/kl": 4.64314079284668, + "objective/non_score_reward": -0.46431419253349304, + "objective/rlhf_reward": -1.4572567533701657, + "objective/scores": 0.1, + "policy/approxkl_avg": 62.679962158203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.545394778251648, + "step": 19, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998852252960205 + }, + { + "episode": 336, + "epoch": 0.006039472265161592, + "loss/policy_avg": 0.1268569827079773, + "lr": 2.9961656441717792e-06, + "objective/entropy": 150.50843811035156, + "objective/kl": 4.864284515380859, + "objective/non_score_reward": -0.48642849922180176, + "objective/rlhf_reward": -0.3894547065168168, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 35.721397399902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5442019701004028, + "step": 20, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002377986907959 + }, + { + "episode": 352, + "epoch": 0.006327066182550239, + "loss/policy_avg": 0.21250608563423157, + "lr": 2.995973926380368e-06, + "objective/entropy": 80.20552062988281, + "objective/kl": 9.005538940429688, + "objective/non_score_reward": -0.9005540013313293, + "objective/rlhf_reward": -1.8688826049367586, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 145.47787475585938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5761544704437256, + "step": 21, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991693496704102 + }, + { + "episode": 368, + "epoch": 0.006614660099938887, + "loss/policy_avg": 0.049590617418289185, + "lr": 2.9957822085889573e-06, + "objective/entropy": -100.090576171875, + "objective/kl": 12.812070846557617, + "objective/non_score_reward": -1.2812069654464722, + "objective/rlhf_reward": -3.1774167818593337, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 271.628173828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.636669397354126, + "step": 22, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9962739944458008 + }, + { + "episode": 384, + "epoch": 0.0069022540173275335, + "loss/policy_avg": 0.09350229799747467, + "lr": 2.995590490797546e-06, + "objective/entropy": -59.7061653137207, + "objective/kl": 12.184288024902344, + "objective/non_score_reward": -1.2184288501739502, + "objective/rlhf_reward": -2.751008885280166, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 219.70611572265625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6429183483123779, + "step": 23, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0023770332336426 + }, + { + "episode": 400, + "epoch": 0.00718984793471618, + "loss/policy_avg": 0.05978470295667648, + "lr": 2.995398773006135e-06, + "objective/entropy": 102.0956802368164, + "objective/kl": 4.846743106842041, + "objective/non_score_reward": -0.484674334526062, + "objective/rlhf_reward": 0.9850217357862268, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 55.893699645996094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7100609540939331, + "step": 24, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99696946144104 + }, + { + "episode": 416, + "epoch": 0.007477441852104828, + "loss/policy_avg": 0.047114282846450806, + "lr": 2.995207055214724e-06, + "objective/entropy": -127.81246185302734, + "objective/kl": 9.111923217773438, + "objective/non_score_reward": -0.9111922979354858, + "objective/rlhf_reward": -3.244769042730331, + "objective/scores": 0.1, + "policy/approxkl_avg": 122.56946563720703, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6466059684753418, + "step": 25, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0014986991882324 + }, + { + "episode": 432, + "epoch": 0.007765035769493475, + "loss/policy_avg": 0.7473582029342651, + "lr": 2.995015337423313e-06, + "objective/entropy": 150.059814453125, + "objective/kl": 10.861345291137695, + "objective/non_score_reward": -1.0861345529556274, + "objective/rlhf_reward": -6.34453821182251, + "objective/scores": -0.5, + "policy/approxkl_avg": 136.23809814453125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46276742219924927, + "step": 26, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0018882751464844 + }, + { + "episode": 448, + "epoch": 0.008052629686882123, + "loss/policy_avg": 0.444513201713562, + "lr": 2.994823619631902e-06, + "objective/entropy": -46.63388442993164, + "objective/kl": 8.348082542419434, + "objective/non_score_reward": -0.834808349609375, + "objective/rlhf_reward": -2.939233502745628, + "objective/scores": 0.1, + "policy/approxkl_avg": 137.52908325195312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7123738527297974, + "step": 27, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9967291355133057 + }, + { + "episode": 464, + "epoch": 0.00834022360427077, + "loss/policy_avg": 0.11199073493480682, + "lr": 2.994631901840491e-06, + "objective/entropy": -52.04168701171875, + "objective/kl": 7.455352783203125, + "objective/non_score_reward": -0.7455353736877441, + "objective/rlhf_reward": -0.5821412935853005, + "objective/scores": 0.6, + "policy/approxkl_avg": 92.71267700195312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48107001185417175, + "step": 28, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976396560668945 + }, + { + "episode": 480, + "epoch": 0.008627817521659416, + "loss/policy_avg": 0.11989644169807434, + "lr": 2.9944401840490798e-06, + "objective/entropy": -23.16903305053711, + "objective/kl": 6.455074310302734, + "objective/non_score_reward": -0.6455073356628418, + "objective/rlhf_reward": 1.8179705530405048, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.73835754394531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.43547505140304565, + "step": 29, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984638690948486 + }, + { + "episode": 496, + "epoch": 0.008915411439048063, + "loss/policy_avg": -0.034143999218940735, + "lr": 2.994248466257669e-06, + "objective/entropy": -80.7169418334961, + "objective/kl": 6.607659339904785, + "objective/non_score_reward": -0.6607659459114075, + "objective/rlhf_reward": -0.2430639252066611, + "objective/scores": 0.6, + "policy/approxkl_avg": 48.386138916015625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6621455550193787, + "step": 30, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0041260719299316 + }, + { + "episode": 512, + "epoch": 0.009203005356436712, + "loss/policy_avg": 0.3250073790550232, + "lr": 2.994056748466258e-06, + "objective/entropy": 30.010330200195312, + "objective/kl": 8.299867630004883, + "objective/non_score_reward": -0.8299866914749146, + "objective/rlhf_reward": -0.39622787082311783, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 136.398681640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5553452372550964, + "step": 31, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970982074737549 + }, + { + "episode": 528, + "epoch": 0.009490599273825359, + "loss/policy_avg": 0.128182053565979, + "lr": 2.9938650306748466e-06, + "objective/entropy": -206.34194946289062, + "objective/kl": 2.580972671508789, + "objective/non_score_reward": -0.25809726119041443, + "objective/rlhf_reward": 1.89132993972185, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.516483306884766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4866250157356262, + "step": 32, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0006601810455322 + }, + { + "episode": 544, + "epoch": 0.009778193191214006, + "loss/policy_avg": 0.14243654906749725, + "lr": 2.993673312883436e-06, + "objective/entropy": 26.605953216552734, + "objective/kl": 4.795662879943848, + "objective/non_score_reward": -0.4795662462711334, + "objective/rlhf_reward": 0.029146325810019302, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 41.724029541015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3814541697502136, + "step": 33, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984464645385742 + }, + { + "episode": 560, + "epoch": 0.010065787108602653, + "loss/policy_avg": 0.0956430584192276, + "lr": 2.9934815950920243e-06, + "objective/entropy": -151.531982421875, + "objective/kl": 8.481374740600586, + "objective/non_score_reward": -0.8481374979019165, + "objective/rlhf_reward": -2.9925499673932787, + "objective/scores": 0.1, + "policy/approxkl_avg": 161.83607482910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7393007278442383, + "step": 34, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999760627746582 + }, + { + "episode": 576, + "epoch": 0.0103533810259913, + "loss/policy_avg": 0.20686647295951843, + "lr": 2.9932898773006135e-06, + "objective/entropy": -46.547210693359375, + "objective/kl": 15.390876770019531, + "objective/non_score_reward": -1.5390875339508057, + "objective/rlhf_reward": -3.232631032110426, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 324.61724853515625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.49993449449539185, + "step": 35, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998364448547363 + }, + { + "episode": 592, + "epoch": 0.010640974943379948, + "loss/policy_avg": 0.14229583740234375, + "lr": 2.9930981595092023e-06, + "objective/entropy": 53.75727462768555, + "objective/kl": 9.625295639038086, + "objective/non_score_reward": -0.9625297784805298, + "objective/rlhf_reward": 0.5498811393976215, + "objective/scores": 1.1, + "policy/approxkl_avg": 131.6404266357422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7562763690948486, + "step": 36, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986224174499512 + }, + { + "episode": 608, + "epoch": 0.010928568860768595, + "loss/policy_avg": 0.09236406534910202, + "lr": 2.9929064417177915e-06, + "objective/entropy": 13.225410461425781, + "objective/kl": 6.162755966186523, + "objective/non_score_reward": -0.6162755489349365, + "objective/rlhf_reward": -2.065102344751358, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.094545364379883, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6474248170852661, + "step": 37, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988819360733032 + }, + { + "episode": 624, + "epoch": 0.011216162778157242, + "loss/policy_avg": 0.3340108394622803, + "lr": 2.9927147239263803e-06, + "objective/entropy": 62.37703323364258, + "objective/kl": 4.8724799156188965, + "objective/non_score_reward": -0.4872480034828186, + "objective/rlhf_reward": -1.5489919766783713, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.97528076171875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3959454894065857, + "step": 38, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985570907592773 + }, + { + "episode": 640, + "epoch": 0.011503756695545889, + "loss/policy_avg": 0.028094250708818436, + "lr": 2.992523006134969e-06, + "objective/entropy": -5.561492919921875, + "objective/kl": 11.000988006591797, + "objective/non_score_reward": -1.100098967552185, + "objective/rlhf_reward": -4.000396034121513, + "objective/scores": 0.1, + "policy/approxkl_avg": 196.45921325683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7062462568283081, + "step": 39, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003294944763184 + }, + { + "episode": 656, + "epoch": 0.011791350612934537, + "loss/policy_avg": 0.08169247210025787, + "lr": 2.9923312883435584e-06, + "objective/entropy": -130.83839416503906, + "objective/kl": 8.569768905639648, + "objective/non_score_reward": -0.8569770455360413, + "objective/rlhf_reward": 0.9720918476581577, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.26432037353516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7602853775024414, + "step": 40, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99922776222229 + }, + { + "episode": 672, + "epoch": 0.012078944530323184, + "loss/policy_avg": 0.0993257462978363, + "lr": 2.992139570552147e-06, + "objective/entropy": 204.11431884765625, + "objective/kl": 5.267461776733398, + "objective/non_score_reward": -0.5267462134361267, + "objective/rlhf_reward": -1.7069848686456681, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.98307800292969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5099983215332031, + "step": 41, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977500438690186 + }, + { + "episode": 688, + "epoch": 0.012366538447711831, + "loss/policy_avg": 0.14465492963790894, + "lr": 2.9919478527607364e-06, + "objective/entropy": 65.64169311523438, + "objective/kl": 8.029642105102539, + "objective/non_score_reward": -0.8029642701148987, + "objective/rlhf_reward": 1.1881429269909862, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.06425476074219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4883229732513428, + "step": 42, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979116916656494 + }, + { + "episode": 704, + "epoch": 0.012654132365100478, + "loss/policy_avg": 0.3832010328769684, + "lr": 2.9917561349693252e-06, + "objective/entropy": -72.26643371582031, + "objective/kl": 8.761590957641602, + "objective/non_score_reward": -0.876159131526947, + "objective/rlhf_reward": -5.504636764526367, + "objective/scores": -0.5, + "policy/approxkl_avg": 114.79124450683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7940360903739929, + "step": 43, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983117580413818 + }, + { + "episode": 720, + "epoch": 0.012941726282489125, + "loss/policy_avg": 0.1457168310880661, + "lr": 2.991564417177914e-06, + "objective/entropy": 215.26284790039062, + "objective/kl": 8.928382873535156, + "objective/non_score_reward": -0.8928384184837341, + "objective/rlhf_reward": 0.8286463558673862, + "objective/scores": 1.1, + "policy/approxkl_avg": 73.42234802246094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5377756953239441, + "step": 44, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967031478881836 + }, + { + "episode": 736, + "epoch": 0.013229320199877773, + "loss/policy_avg": 0.5347464680671692, + "lr": 2.9913726993865033e-06, + "objective/entropy": -0.6218109130859375, + "objective/kl": 10.952564239501953, + "objective/non_score_reward": -1.0952564477920532, + "objective/rlhf_reward": -1.98102588057518, + "objective/scores": 0.6, + "policy/approxkl_avg": 208.24761962890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8326528072357178, + "step": 45, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986271858215332 + }, + { + "episode": 752, + "epoch": 0.01351691411726642, + "loss/policy_avg": 0.08795450627803802, + "lr": 2.991180981595092e-06, + "objective/entropy": 53.97735595703125, + "objective/kl": 9.161317825317383, + "objective/non_score_reward": -0.9161317348480225, + "objective/rlhf_reward": -3.2645270287990567, + "objective/scores": 0.1, + "policy/approxkl_avg": 139.39212036132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3978268504142761, + "step": 46, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986640214920044 + }, + { + "episode": 768, + "epoch": 0.013804508034655067, + "loss/policy_avg": 0.22336724400520325, + "lr": 2.990989263803681e-06, + "objective/entropy": 95.49320983886719, + "objective/kl": 6.099149703979492, + "objective/non_score_reward": -0.6099148988723755, + "objective/rlhf_reward": 0.4840593293893609, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 63.69355773925781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6312753558158875, + "step": 47, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990160465240479 + }, + { + "episode": 784, + "epoch": 0.014092101952043714, + "loss/policy_avg": -0.4834544062614441, + "lr": 2.99079754601227e-06, + "objective/entropy": 103.409912109375, + "objective/kl": 8.192754745483398, + "objective/non_score_reward": -0.8192753195762634, + "objective/rlhf_reward": -0.8771014422178267, + "objective/scores": 0.6, + "policy/approxkl_avg": 209.68890380859375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6371721029281616, + "step": 48, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002427101135254 + }, + { + "episode": 800, + "epoch": 0.01437969586943236, + "loss/policy_avg": 0.0012040697038173676, + "lr": 2.990605828220859e-06, + "objective/entropy": 278.3375244140625, + "objective/kl": 15.085844039916992, + "objective/non_score_reward": -1.5085842609405518, + "objective/rlhf_reward": -5.6343372169882056, + "objective/scores": 0.1, + "policy/approxkl_avg": 569.4739379882812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8680420517921448, + "step": 49, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974499940872192 + }, + { + "episode": 816, + "epoch": 0.01466728978682101, + "loss/policy_avg": 0.08105640113353729, + "lr": 2.990414110429448e-06, + "objective/entropy": 82.5201416015625, + "objective/kl": 14.61972713470459, + "objective/non_score_reward": -1.461972713470459, + "objective/rlhf_reward": -7.847890853881836, + "objective/scores": -0.5, + "policy/approxkl_avg": 473.65753173828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8534562587738037, + "step": 50, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0009145736694336 + }, + { + "episode": 832, + "epoch": 0.014954883704209656, + "loss/policy_avg": 0.40666699409484863, + "lr": 2.990222392638037e-06, + "objective/entropy": -77.4280014038086, + "objective/kl": 7.943630218505859, + "objective/non_score_reward": -0.7943630814552307, + "objective/rlhf_reward": -0.2537333711397376, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 91.87364196777344, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6725068092346191, + "step": 51, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985764026641846 + }, + { + "episode": 848, + "epoch": 0.015242477621598303, + "loss/policy_avg": 0.37404656410217285, + "lr": 2.990030674846626e-06, + "objective/entropy": 48.54077911376953, + "objective/kl": 11.823626518249512, + "objective/non_score_reward": -1.1823625564575195, + "objective/rlhf_reward": -6.729450225830078, + "objective/scores": -0.5, + "policy/approxkl_avg": 223.62557983398438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7328237295150757, + "step": 52, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987246990203857 + }, + { + "episode": 864, + "epoch": 0.01553007153898695, + "loss/policy_avg": 0.5382946729660034, + "lr": 2.989838957055215e-06, + "objective/entropy": -190.2376708984375, + "objective/kl": 9.007357597351074, + "objective/non_score_reward": -0.9007357954978943, + "objective/rlhf_reward": -0.6792241677057472, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 168.06661987304688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4519304931163788, + "step": 53, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998849868774414 + }, + { + "episode": 880, + "epoch": 0.0158176654563756, + "loss/policy_avg": 0.4903010427951813, + "lr": 2.989647239263804e-06, + "objective/entropy": 7.207241058349609, + "objective/kl": 4.414880275726318, + "objective/non_score_reward": -0.44148799777030945, + "objective/rlhf_reward": 2.634048038721085, + "objective/scores": 1.1, + "policy/approxkl_avg": 44.266204833984375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.747165858745575, + "step": 54, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998467206954956 + }, + { + "episode": 896, + "epoch": 0.016105259373764245, + "loss/policy_avg": 0.009914087131619453, + "lr": 2.989455521472393e-06, + "objective/entropy": -35.93499755859375, + "objective/kl": 8.610208511352539, + "objective/non_score_reward": -0.8610208630561829, + "objective/rlhf_reward": -1.6192545398798694, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 146.18605041503906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6474767923355103, + "step": 55, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002448558807373 + }, + { + "episode": 912, + "epoch": 0.016392853291152892, + "loss/policy_avg": 0.009470928460359573, + "lr": 2.9892638036809815e-06, + "objective/entropy": -125.8683853149414, + "objective/kl": 9.624561309814453, + "objective/non_score_reward": -0.9624560475349426, + "objective/rlhf_reward": -0.9261052950632301, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 156.87704467773438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4992007613182068, + "step": 56, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993857145309448 + }, + { + "episode": 928, + "epoch": 0.01668044720854154, + "loss/policy_avg": 0.30396610498428345, + "lr": 2.9890720858895707e-06, + "objective/entropy": 47.94700622558594, + "objective/kl": 8.891968727111816, + "objective/non_score_reward": -0.8891968727111816, + "objective/rlhf_reward": -5.556787490844727, + "objective/scores": -0.5, + "policy/approxkl_avg": 125.919189453125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7451653480529785, + "step": 57, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991803169250488 + }, + { + "episode": 944, + "epoch": 0.016968041125930186, + "loss/policy_avg": -0.1904684454202652, + "lr": 2.9888803680981595e-06, + "objective/entropy": 228.5303192138672, + "objective/kl": 4.137008190155029, + "objective/non_score_reward": -0.4137008786201477, + "objective/rlhf_reward": -1.25480350703001, + "objective/scores": 0.1, + "policy/approxkl_avg": 51.00769805908203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6238487958908081, + "step": 58, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0140719413757324 + }, + { + "episode": 960, + "epoch": 0.017255635043318833, + "loss/policy_avg": 0.8186465501785278, + "lr": 2.9886886503067483e-06, + "objective/entropy": -20.18294906616211, + "objective/kl": 10.374330520629883, + "objective/non_score_reward": -1.03743314743042, + "objective/rlhf_reward": -1.7497324258089064, + "objective/scores": 0.6, + "policy/approxkl_avg": 178.15145874023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5619127750396729, + "step": 59, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0021634101867676 + }, + { + "episode": 976, + "epoch": 0.01754322896070748, + "loss/policy_avg": 0.45171883702278137, + "lr": 2.9884969325153375e-06, + "objective/entropy": 44.993682861328125, + "objective/kl": 7.884735584259033, + "objective/non_score_reward": -0.78847336769104, + "objective/rlhf_reward": -2.75389347076416, + "objective/scores": 0.1, + "policy/approxkl_avg": 131.51107788085938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5738007426261902, + "step": 60, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988288879394531 + }, + { + "episode": 992, + "epoch": 0.017830822878096127, + "loss/policy_avg": 0.697486162185669, + "lr": 2.9883052147239263e-06, + "objective/entropy": 26.72112274169922, + "objective/kl": 7.41924524307251, + "objective/non_score_reward": -0.7419244647026062, + "objective/rlhf_reward": -0.5676979482173918, + "objective/scores": 0.6, + "policy/approxkl_avg": 87.49612426757812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.673904538154602, + "step": 61, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979908466339111 + }, + { + "episode": 1008, + "epoch": 0.018118416795484777, + "loss/policy_avg": 0.08780250698328018, + "lr": 2.988113496932515e-06, + "objective/entropy": 144.9136962890625, + "objective/kl": 7.059360504150391, + "objective/non_score_reward": -0.705936074256897, + "objective/rlhf_reward": 0.09997491097333766, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 81.5174560546875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7397779226303101, + "step": 62, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999450922012329 + }, + { + "episode": 1024, + "epoch": 0.018406010712873424, + "loss/policy_avg": 0.6007635593414307, + "lr": 2.9879217791411044e-06, + "objective/entropy": 116.3339614868164, + "objective/kl": 7.176075458526611, + "objective/non_score_reward": -0.7176075577735901, + "objective/rlhf_reward": -2.470430406183004, + "objective/scores": 0.1, + "policy/approxkl_avg": 73.61609649658203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4787016808986664, + "step": 63, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989898204803467 + }, + { + "episode": 1040, + "epoch": 0.01869360463026207, + "loss/policy_avg": 0.14422942698001862, + "lr": 2.987730061349693e-06, + "objective/entropy": 124.52241516113281, + "objective/kl": 9.423843383789062, + "objective/non_score_reward": -0.9423844218254089, + "objective/rlhf_reward": -0.8458185240041939, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 138.37496948242188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8451459407806396, + "step": 64, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998455286026001 + }, + { + "episode": 1056, + "epoch": 0.018981198547650718, + "loss/policy_avg": 0.38644856214523315, + "lr": 2.9875383435582824e-06, + "objective/entropy": -148.96302795410156, + "objective/kl": 6.528387546539307, + "objective/non_score_reward": -0.6528387665748596, + "objective/rlhf_reward": -0.4886488600828983, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 81.52159881591797, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.643337607383728, + "step": 65, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999079942703247 + }, + { + "episode": 1072, + "epoch": 0.019268792465039365, + "loss/policy_avg": -0.05883178487420082, + "lr": 2.9873466257668712e-06, + "objective/entropy": -121.89275360107422, + "objective/kl": 7.966899871826172, + "objective/non_score_reward": -0.7966899871826172, + "objective/rlhf_reward": -2.786759978532791, + "objective/scores": 0.1, + "policy/approxkl_avg": 141.0238037109375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6699905395507812, + "step": 66, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981606006622314 + }, + { + "episode": 1088, + "epoch": 0.01955638638242801, + "loss/policy_avg": 0.21825438737869263, + "lr": 2.98715490797546e-06, + "objective/entropy": 17.15899658203125, + "objective/kl": 11.302406311035156, + "objective/non_score_reward": -1.130240559577942, + "objective/rlhf_reward": -6.520962715148926, + "objective/scores": -0.5, + "policy/approxkl_avg": 198.47238159179688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7949972748756409, + "step": 67, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980106353759766 + }, + { + "episode": 1104, + "epoch": 0.019843980299816658, + "loss/policy_avg": 0.2142024040222168, + "lr": 2.9869631901840493e-06, + "objective/entropy": -47.186798095703125, + "objective/kl": 10.205244064331055, + "objective/non_score_reward": -1.0205243825912476, + "objective/rlhf_reward": -3.682097455859184, + "objective/scores": 0.1, + "policy/approxkl_avg": 154.7986297607422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6624069213867188, + "step": 68, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982914924621582 + }, + { + "episode": 1120, + "epoch": 0.020131574217205305, + "loss/policy_avg": 0.38636407256126404, + "lr": 2.986771472392638e-06, + "objective/entropy": -56.353668212890625, + "objective/kl": 9.813121795654297, + "objective/non_score_reward": -0.9813121557235718, + "objective/rlhf_reward": -5.925248146057129, + "objective/scores": -0.5, + "policy/approxkl_avg": 89.94273376464844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.49075233936309814, + "step": 69, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0006632804870605 + }, + { + "episode": 1136, + "epoch": 0.020419168134593952, + "loss/policy_avg": 0.1683022379875183, + "lr": 2.9865797546012273e-06, + "objective/entropy": 218.84620666503906, + "objective/kl": 18.28194808959961, + "objective/non_score_reward": -1.8281950950622559, + "objective/rlhf_reward": -4.912780082225799, + "objective/scores": 0.6, + "policy/approxkl_avg": 422.34417724609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5468066930770874, + "step": 70, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993388652801514 + }, + { + "episode": 1152, + "epoch": 0.0207067620519826, + "loss/policy_avg": 0.12364174425601959, + "lr": 2.986388036809816e-06, + "objective/entropy": -157.8475341796875, + "objective/kl": 13.170989036560059, + "objective/non_score_reward": -1.3170989751815796, + "objective/rlhf_reward": -7.26839542388916, + "objective/scores": -0.5, + "policy/approxkl_avg": 184.91448974609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8540889620780945, + "step": 71, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9987365007400513 + }, + { + "episode": 1168, + "epoch": 0.02099435596937125, + "loss/policy_avg": 0.03804938867688179, + "lr": 2.986196319018405e-06, + "objective/entropy": 12.461807250976562, + "objective/kl": 7.584700107574463, + "objective/non_score_reward": -0.7584700584411621, + "objective/rlhf_reward": -0.11016108536836766, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 53.31656265258789, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7939921617507935, + "step": 72, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002026557922363 + }, + { + "episode": 1184, + "epoch": 0.021281949886759896, + "loss/policy_avg": 0.4785844683647156, + "lr": 2.986004601226994e-06, + "objective/entropy": -64.63442993164062, + "objective/kl": 13.00765609741211, + "objective/non_score_reward": -1.3007656335830688, + "objective/rlhf_reward": -4.803062631189823, + "objective/scores": 0.1, + "policy/approxkl_avg": 264.447998046875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6026296615600586, + "step": 73, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995261430740356 + }, + { + "episode": 1200, + "epoch": 0.021569543804148543, + "loss/policy_avg": 0.22290995717048645, + "lr": 2.985812883435583e-06, + "objective/entropy": -106.69702911376953, + "objective/kl": 13.168065071105957, + "objective/non_score_reward": -1.316806435585022, + "objective/rlhf_reward": -7.267225742340088, + "objective/scores": -0.5, + "policy/approxkl_avg": 273.555419921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6650924682617188, + "step": 74, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982670545578003 + }, + { + "episode": 1216, + "epoch": 0.02185713772153719, + "loss/policy_avg": 0.29405301809310913, + "lr": 2.985621165644172e-06, + "objective/entropy": 133.62835693359375, + "objective/kl": 8.014554023742676, + "objective/non_score_reward": -0.8014553189277649, + "objective/rlhf_reward": -2.8058213055133816, + "objective/scores": 0.1, + "policy/approxkl_avg": 96.36099243164062, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7634880542755127, + "step": 75, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991488456726074 + }, + { + "episode": 1232, + "epoch": 0.022144731638925837, + "loss/policy_avg": 0.45445433259010315, + "lr": 2.985429447852761e-06, + "objective/entropy": 82.93301391601562, + "objective/kl": 8.670784950256348, + "objective/non_score_reward": -0.8670786023139954, + "objective/rlhf_reward": -1.0683142602443696, + "objective/scores": 0.6, + "policy/approxkl_avg": 148.96737670898438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.540373682975769, + "step": 76, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980230331420898 + }, + { + "episode": 1248, + "epoch": 0.022432325556314484, + "loss/policy_avg": 0.020047597587108612, + "lr": 2.98523773006135e-06, + "objective/entropy": -212.8873291015625, + "objective/kl": 5.708805561065674, + "objective/non_score_reward": -0.5708805918693542, + "objective/rlhf_reward": 0.6401966915118966, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 42.32015609741211, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7310128211975098, + "step": 77, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995827674865723 + }, + { + "episode": 1264, + "epoch": 0.02271991947370313, + "loss/policy_avg": -0.00048611266538500786, + "lr": 2.9850460122699387e-06, + "objective/entropy": -48.7366943359375, + "objective/kl": 6.82136344909668, + "objective/non_score_reward": -0.6821364164352417, + "objective/rlhf_reward": -2.3285456061363217, + "objective/scores": 0.1, + "policy/approxkl_avg": 66.36034393310547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4771096706390381, + "step": 78, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971214532852173 + }, + { + "episode": 1280, + "epoch": 0.023007513391091777, + "loss/policy_avg": 0.4775531589984894, + "lr": 2.9848542944785275e-06, + "objective/entropy": -153.65670776367188, + "objective/kl": 11.706863403320312, + "objective/non_score_reward": -1.1706863641738892, + "objective/rlhf_reward": -3.078625496391373, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 175.5146484375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5806171894073486, + "step": 79, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999029278755188 + }, + { + "episode": 1296, + "epoch": 0.023295107308480424, + "loss/policy_avg": 0.46404796838760376, + "lr": 2.9846625766871167e-06, + "objective/entropy": -0.7676467895507812, + "objective/kl": 9.653882026672363, + "objective/non_score_reward": -0.9653880596160889, + "objective/rlhf_reward": -3.461552521586418, + "objective/scores": 0.1, + "policy/approxkl_avg": 124.8095703125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6914379596710205, + "step": 80, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988126754760742 + }, + { + "episode": 1312, + "epoch": 0.023582701225869074, + "loss/policy_avg": 0.1949668675661087, + "lr": 2.9844708588957055e-06, + "objective/entropy": -137.8944549560547, + "objective/kl": 9.551393508911133, + "objective/non_score_reward": -0.9551393985748291, + "objective/rlhf_reward": 0.5794423833489422, + "objective/scores": 1.1, + "policy/approxkl_avg": 131.54342651367188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7368413209915161, + "step": 81, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991984367370605 + }, + { + "episode": 1328, + "epoch": 0.02387029514325772, + "loss/policy_avg": 0.08234795928001404, + "lr": 2.9842791411042943e-06, + "objective/entropy": -301.7047119140625, + "objective/kl": 11.51591682434082, + "objective/non_score_reward": -1.1515917778015137, + "objective/rlhf_reward": -4.20636705160141, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.93853759765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.785065770149231, + "step": 82, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992117881774902 + }, + { + "episode": 1344, + "epoch": 0.024157889060646368, + "loss/policy_avg": 0.11510531604290009, + "lr": 2.9840874233128835e-06, + "objective/entropy": -97.20633697509766, + "objective/kl": 10.416614532470703, + "objective/non_score_reward": -1.0416613817214966, + "objective/rlhf_reward": -2.3418168380585422, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 169.93270874023438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6234397888183594, + "step": 83, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0019311904907227 + }, + { + "episode": 1360, + "epoch": 0.024445482978035015, + "loss/policy_avg": -0.0069921668618917465, + "lr": 2.9838957055214724e-06, + "objective/entropy": 30.303848266601562, + "objective/kl": 8.926748275756836, + "objective/non_score_reward": -0.8926749229431152, + "objective/rlhf_reward": -5.570699691772461, + "objective/scores": -0.5, + "policy/approxkl_avg": 76.47716522216797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5039442181587219, + "step": 84, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998093843460083 + }, + { + "episode": 1376, + "epoch": 0.024733076895423662, + "loss/policy_avg": 0.27738839387893677, + "lr": 2.983703987730061e-06, + "objective/entropy": -166.7930450439453, + "objective/kl": 6.530454635620117, + "objective/non_score_reward": -0.6530454754829407, + "objective/rlhf_reward": -2.2121819466352464, + "objective/scores": 0.1, + "policy/approxkl_avg": 59.98471450805664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.90630704164505, + "step": 85, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998368263244629 + }, + { + "episode": 1392, + "epoch": 0.02502067081281231, + "loss/policy_avg": 0.22002673149108887, + "lr": 2.9835122699386504e-06, + "objective/entropy": -67.69169616699219, + "objective/kl": 11.10516357421875, + "objective/non_score_reward": -1.1105163097381592, + "objective/rlhf_reward": -2.6172365203228702, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 100.68114471435547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6418424844741821, + "step": 86, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970009326934814 + }, + { + "episode": 1408, + "epoch": 0.025308264730200956, + "loss/policy_avg": 0.41238439083099365, + "lr": 2.983320552147239e-06, + "objective/entropy": -16.58879852294922, + "objective/kl": 8.708709716796875, + "objective/non_score_reward": -0.8708709478378296, + "objective/rlhf_reward": -5.483483791351318, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.49099731445312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6647894978523254, + "step": 87, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9938620328903198 + }, + { + "episode": 1424, + "epoch": 0.025595858647589603, + "loss/policy_avg": -0.10927846282720566, + "lr": 2.9831288343558284e-06, + "objective/entropy": 16.377220153808594, + "objective/kl": 13.530142784118652, + "objective/non_score_reward": -1.353014349937439, + "objective/rlhf_reward": -3.2893511898079257, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 238.56805419921875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6255956888198853, + "step": 88, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985899925231934 + }, + { + "episode": 1440, + "epoch": 0.02588345256497825, + "loss/policy_avg": 0.1418202817440033, + "lr": 2.9829371165644172e-06, + "objective/entropy": 156.89816284179688, + "objective/kl": 12.802512168884277, + "objective/non_score_reward": -1.2802512645721436, + "objective/rlhf_reward": -3.296176548275064, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 294.27996826171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6715450286865234, + "step": 89, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973416328430176 + }, + { + "episode": 1456, + "epoch": 0.026171046482366896, + "loss/policy_avg": 0.008285747841000557, + "lr": 2.982745398773006e-06, + "objective/entropy": 97.516357421875, + "objective/kl": 9.460161209106445, + "objective/non_score_reward": -0.9460161328315735, + "objective/rlhf_reward": 0.6159354835748676, + "objective/scores": 1.1, + "policy/approxkl_avg": 122.57460021972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8431274890899658, + "step": 90, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984780550003052 + }, + { + "episode": 1472, + "epoch": 0.026458640399755547, + "loss/policy_avg": 0.5656089782714844, + "lr": 2.9825536809815953e-06, + "objective/entropy": 103.79216766357422, + "objective/kl": 8.058956146240234, + "objective/non_score_reward": -0.8058955669403076, + "objective/rlhf_reward": -2.8235821485519406, + "objective/scores": 0.1, + "policy/approxkl_avg": 123.79965209960938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46594178676605225, + "step": 91, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9949241876602173 + }, + { + "episode": 1488, + "epoch": 0.026746234317144194, + "loss/policy_avg": 0.6543309688568115, + "lr": 2.982361963190184e-06, + "objective/entropy": -186.4047393798828, + "objective/kl": 11.067312240600586, + "objective/non_score_reward": -1.1067311763763428, + "objective/rlhf_reward": -2.479513700084622, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 161.9315948486328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6305772066116333, + "step": 92, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993164539337158 + }, + { + "episode": 1504, + "epoch": 0.02703382823453284, + "loss/policy_avg": 0.19107398390769958, + "lr": 2.9821702453987733e-06, + "objective/entropy": -34.54255294799805, + "objective/kl": 7.058377265930176, + "objective/non_score_reward": -0.7058378458023071, + "objective/rlhf_reward": 1.5766486465930942, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.1898193359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5186392068862915, + "step": 93, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9963679313659668 + }, + { + "episode": 1520, + "epoch": 0.027321422151921487, + "loss/policy_avg": 0.0312882624566555, + "lr": 2.981978527607362e-06, + "objective/entropy": 19.131616592407227, + "objective/kl": 7.02040958404541, + "objective/non_score_reward": -0.7020410299301147, + "objective/rlhf_reward": -4.808164119720459, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.332481384277344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6216336488723755, + "step": 94, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971067905426025 + }, + { + "episode": 1536, + "epoch": 0.027609016069310134, + "loss/policy_avg": 0.08140967786312103, + "lr": 2.981786809815951e-06, + "objective/entropy": 13.800872802734375, + "objective/kl": 6.789825439453125, + "objective/non_score_reward": -0.6789825558662415, + "objective/rlhf_reward": -4.715930461883545, + "objective/scores": -0.5, + "policy/approxkl_avg": 50.50202560424805, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5023022890090942, + "step": 95, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999751329421997 + }, + { + "episode": 1552, + "epoch": 0.02789660998669878, + "loss/policy_avg": 0.1259385198354721, + "lr": 2.98159509202454e-06, + "objective/entropy": 1.0717048645019531, + "objective/kl": 10.119159698486328, + "objective/non_score_reward": -1.011915922164917, + "objective/rlhf_reward": -3.6476640462875363, + "objective/scores": 0.1, + "policy/approxkl_avg": 155.03924560546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5893478393554688, + "step": 96, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999608039855957 + }, + { + "episode": 1568, + "epoch": 0.028184203904087428, + "loss/policy_avg": 0.35828953981399536, + "lr": 2.981403374233129e-06, + "objective/entropy": 90.25312042236328, + "objective/kl": 18.374267578125, + "objective/non_score_reward": -1.8374266624450684, + "objective/rlhf_reward": -2.949706649780273, + "objective/scores": 1.1, + "policy/approxkl_avg": 272.5823669433594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4844147562980652, + "step": 97, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982144832611084 + }, + { + "episode": 1584, + "epoch": 0.028471797821476075, + "loss/policy_avg": 0.37511101365089417, + "lr": 2.981211656441718e-06, + "objective/entropy": 57.79621505737305, + "objective/kl": 11.18044662475586, + "objective/non_score_reward": -1.1180447340011597, + "objective/rlhf_reward": -6.4721784591674805, + "objective/scores": -0.5, + "policy/approxkl_avg": 282.01220703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4804549217224121, + "step": 98, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971604347229004 + }, + { + "episode": 1600, + "epoch": 0.02875939173886472, + "loss/policy_avg": 0.43748798966407776, + "lr": 2.981019938650307e-06, + "objective/entropy": -91.33480834960938, + "objective/kl": 9.78958797454834, + "objective/non_score_reward": -0.9789588451385498, + "objective/rlhf_reward": -5.915835380554199, + "objective/scores": -0.5, + "policy/approxkl_avg": 210.7143096923828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6887623071670532, + "step": 99, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999546766281128 + }, + { + "episode": 1616, + "epoch": 0.029046985656253372, + "loss/policy_avg": 0.012114331126213074, + "lr": 2.980828220858896e-06, + "objective/entropy": 21.23130226135254, + "objective/kl": 3.5349526405334473, + "objective/non_score_reward": -0.3534952402114868, + "objective/rlhf_reward": 1.509737993835238, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 18.795394897460938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.649235188961029, + "step": 100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0008366107940674 + }, + { + "episode": 1632, + "epoch": 0.02933457957364202, + "loss/policy_avg": 0.08204736560583115, + "lr": 2.9806365030674847e-06, + "objective/entropy": -141.25718688964844, + "objective/kl": 11.29146957397461, + "objective/non_score_reward": -1.1291468143463135, + "objective/rlhf_reward": -0.11658706367015803, + "objective/scores": 1.1, + "policy/approxkl_avg": 166.33197021484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6492342352867126, + "step": 101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9966166019439697 + }, + { + "episode": 1648, + "epoch": 0.029622173491030666, + "loss/policy_avg": 0.08084648847579956, + "lr": 2.9804447852760735e-06, + "objective/entropy": 111.48165893554688, + "objective/kl": 7.421027183532715, + "objective/non_score_reward": -0.7421026825904846, + "objective/rlhf_reward": 1.4315892295911912, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.68070983886719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7161847949028015, + "step": 102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976788759231567 + }, + { + "episode": 1664, + "epoch": 0.029909767408419313, + "loss/policy_avg": 0.34747451543807983, + "lr": 2.9802530674846627e-06, + "objective/entropy": 11.092723846435547, + "objective/kl": 10.786249160766602, + "objective/non_score_reward": -1.078624963760376, + "objective/rlhf_reward": -6.314499855041504, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.8665008544922, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6497814655303955, + "step": 103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999152660369873 + }, + { + "episode": 1680, + "epoch": 0.03019736132580796, + "loss/policy_avg": 0.19717364013195038, + "lr": 2.9800613496932515e-06, + "objective/entropy": 84.13946533203125, + "objective/kl": 14.11801528930664, + "objective/non_score_reward": -1.4118015766143799, + "objective/rlhf_reward": -7.6472063064575195, + "objective/scores": -0.5, + "policy/approxkl_avg": 345.1640319824219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7492287158966064, + "step": 104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9964749813079834 + }, + { + "episode": 1696, + "epoch": 0.030484955243196606, + "loss/policy_avg": 0.31150949001312256, + "lr": 2.9798696319018403e-06, + "objective/entropy": 189.52505493164062, + "objective/kl": 6.602322578430176, + "objective/non_score_reward": -0.6602323055267334, + "objective/rlhf_reward": -2.2409292221069332, + "objective/scores": 0.1, + "policy/approxkl_avg": 36.76777648925781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7701338529586792, + "step": 105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979405403137207 + }, + { + "episode": 1712, + "epoch": 0.030772549160585253, + "loss/policy_avg": 0.863737940788269, + "lr": 2.9796779141104296e-06, + "objective/entropy": -57.84851837158203, + "objective/kl": 10.454719543457031, + "objective/non_score_reward": -1.0454717874526978, + "objective/rlhf_reward": -3.7818873882293698, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.14009094238281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6733952164649963, + "step": 106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979674816131592 + }, + { + "episode": 1728, + "epoch": 0.0310601430779739, + "loss/policy_avg": 0.3714882731437683, + "lr": 2.9794861963190184e-06, + "objective/entropy": -48.674835205078125, + "objective/kl": 8.692556381225586, + "objective/non_score_reward": -0.869255542755127, + "objective/rlhf_reward": -1.354315804616485, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 95.59877014160156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5013031959533691, + "step": 107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974138736724854 + }, + { + "episode": 1744, + "epoch": 0.03134773699536255, + "loss/policy_avg": 0.279630184173584, + "lr": 2.9792944785276076e-06, + "objective/entropy": 45.82620620727539, + "objective/kl": 6.540558338165283, + "objective/non_score_reward": -0.6540557742118835, + "objective/rlhf_reward": -4.616223335266113, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.67784118652344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46648138761520386, + "step": 108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995229244232178 + }, + { + "episode": 1760, + "epoch": 0.0316353309127512, + "loss/policy_avg": 0.08991475403308868, + "lr": 2.9791027607361964e-06, + "objective/entropy": -64.23330688476562, + "objective/kl": 8.345191955566406, + "objective/non_score_reward": -0.8345192074775696, + "objective/rlhf_reward": -2.938076882064342, + "objective/scores": 0.1, + "policy/approxkl_avg": 93.55230712890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48790040612220764, + "step": 109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989550113677979 + }, + { + "episode": 1776, + "epoch": 0.031922924830139844, + "loss/policy_avg": 0.28548339009284973, + "lr": 2.9789110429447852e-06, + "objective/entropy": 43.4534912109375, + "objective/kl": 13.334844589233398, + "objective/non_score_reward": -1.333484411239624, + "objective/rlhf_reward": -4.933937734365463, + "objective/scores": 0.1, + "policy/approxkl_avg": 361.4084167480469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.721168041229248, + "step": 110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000476121902466 + }, + { + "episode": 1792, + "epoch": 0.03221051874752849, + "loss/policy_avg": 0.4107435941696167, + "lr": 2.9787193251533744e-06, + "objective/entropy": -182.18148803710938, + "objective/kl": 7.509696006774902, + "objective/non_score_reward": -0.750969648361206, + "objective/rlhf_reward": 1.396121428906918, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.65406036376953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5355631113052368, + "step": 111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999549388885498 + }, + { + "episode": 1808, + "epoch": 0.03249811266491714, + "loss/policy_avg": 0.3796160817146301, + "lr": 2.9785276073619633e-06, + "objective/entropy": 124.50952911376953, + "objective/kl": 8.484485626220703, + "objective/non_score_reward": -0.8484484553337097, + "objective/rlhf_reward": -5.393794059753418, + "objective/scores": -0.5, + "policy/approxkl_avg": 126.5394287109375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6620683670043945, + "step": 112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979877471923828 + }, + { + "episode": 1824, + "epoch": 0.032785706582305785, + "loss/policy_avg": 0.26249387860298157, + "lr": 2.978335889570552e-06, + "objective/entropy": 25.24797821044922, + "objective/kl": 13.403532028198242, + "objective/non_score_reward": -1.34035325050354, + "objective/rlhf_reward": -0.961412942409515, + "objective/scores": 1.1, + "policy/approxkl_avg": 187.52792358398438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6351895332336426, + "step": 113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988791942596436 + }, + { + "episode": 1840, + "epoch": 0.03307330049969443, + "loss/policy_avg": 0.3651992380619049, + "lr": 2.9781441717791413e-06, + "objective/entropy": 95.998779296875, + "objective/kl": 13.46788215637207, + "objective/non_score_reward": -1.3467882871627808, + "objective/rlhf_reward": -7.387153148651123, + "objective/scores": -0.5, + "policy/approxkl_avg": 247.23101806640625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6072109937667847, + "step": 114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9954397678375244 + }, + { + "episode": 1856, + "epoch": 0.03336089441708308, + "loss/policy_avg": 0.3738645017147064, + "lr": 2.97795245398773e-06, + "objective/entropy": 247.19410705566406, + "objective/kl": 14.608449935913086, + "objective/non_score_reward": -1.4608449935913086, + "objective/rlhf_reward": -7.843379974365234, + "objective/scores": -0.5, + "policy/approxkl_avg": 280.8343505859375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7322804927825928, + "step": 115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977314472198486 + }, + { + "episode": 1872, + "epoch": 0.033648488334471725, + "loss/policy_avg": 0.39061659574508667, + "lr": 2.9777607361963193e-06, + "objective/entropy": 58.74927520751953, + "objective/kl": 11.686922073364258, + "objective/non_score_reward": -1.1686923503875732, + "objective/rlhf_reward": -6.674769401550293, + "objective/scores": -0.5, + "policy/approxkl_avg": 226.12789916992188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48344868421554565, + "step": 116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999885559082031 + }, + { + "episode": 1888, + "epoch": 0.03393608225186037, + "loss/policy_avg": 0.3333742022514343, + "lr": 2.977569018404908e-06, + "objective/entropy": -19.94247055053711, + "objective/kl": 9.790740966796875, + "objective/non_score_reward": -0.979074239730835, + "objective/rlhf_reward": -0.9925778030764787, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 72.29800415039062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5458022952079773, + "step": 117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997154951095581 + }, + { + "episode": 1904, + "epoch": 0.03422367616924902, + "loss/policy_avg": 0.09615316987037659, + "lr": 2.977377300613497e-06, + "objective/entropy": -167.89923095703125, + "objective/kl": 8.815143585205078, + "objective/non_score_reward": -0.8815143704414368, + "objective/rlhf_reward": -3.1260574519634243, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.062522888183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8456206321716309, + "step": 118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0000126361846924 + }, + { + "episode": 1920, + "epoch": 0.034511270086637666, + "loss/policy_avg": 0.08710992336273193, + "lr": 2.977185582822086e-06, + "objective/entropy": 263.03179931640625, + "objective/kl": 11.179251670837402, + "objective/non_score_reward": -1.1179251670837402, + "objective/rlhf_reward": -4.071700690686702, + "objective/scores": 0.1, + "policy/approxkl_avg": 161.03988647460938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7780265808105469, + "step": 119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994699954986572 + }, + { + "episode": 1936, + "epoch": 0.03479886400402631, + "loss/policy_avg": 0.13407912850379944, + "lr": 2.976993865030675e-06, + "objective/entropy": -70.0582504272461, + "objective/kl": 6.793869972229004, + "objective/non_score_reward": -0.6793869733810425, + "objective/rlhf_reward": 0.2061711356628213, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 59.43596649169922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7297772169113159, + "step": 120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998098611831665 + }, + { + "episode": 1952, + "epoch": 0.03508645792141496, + "loss/policy_avg": 0.06745412945747375, + "lr": 2.9768021472392642e-06, + "objective/entropy": 70.17347717285156, + "objective/kl": 9.706807136535645, + "objective/non_score_reward": -0.9706807136535645, + "objective/rlhf_reward": -1.9353115958737686, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 101.62091827392578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5260573625564575, + "step": 121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000791549682617 + }, + { + "episode": 1968, + "epoch": 0.03537405183880361, + "loss/policy_avg": 0.056749723851680756, + "lr": 2.976610429447853e-06, + "objective/entropy": -228.255615234375, + "objective/kl": 5.5145463943481445, + "objective/non_score_reward": -0.5514546632766724, + "objective/rlhf_reward": -1.8058185189962388, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.354915618896484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6759341955184937, + "step": 122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004634857177734 + }, + { + "episode": 1984, + "epoch": 0.03566164575619225, + "loss/policy_avg": 0.08238844573497772, + "lr": 2.9764187116564414e-06, + "objective/entropy": 241.84060668945312, + "objective/kl": 9.057453155517578, + "objective/non_score_reward": -0.905745267868042, + "objective/rlhf_reward": -5.622981071472168, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.88520050048828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6288118362426758, + "step": 123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0009031295776367 + }, + { + "episode": 2000, + "epoch": 0.03594923967358091, + "loss/policy_avg": 0.5170639753341675, + "lr": 2.9762269938650307e-06, + "objective/entropy": -229.75860595703125, + "objective/kl": 6.897617816925049, + "objective/non_score_reward": -0.6897618174552917, + "objective/rlhf_reward": 0.1646718115198883, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 75.08253479003906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8155406713485718, + "step": 124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993102550506592 + }, + { + "episode": 2016, + "epoch": 0.036236833590969554, + "loss/policy_avg": 0.23536163568496704, + "lr": 2.9760352760736195e-06, + "objective/entropy": 104.1327896118164, + "objective/kl": 8.855351448059082, + "objective/non_score_reward": -0.8855351209640503, + "objective/rlhf_reward": -0.6184214695703713, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 104.30943298339844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7742270231246948, + "step": 125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001922607421875 + }, + { + "episode": 2032, + "epoch": 0.0365244275083582, + "loss/policy_avg": 0.3929940164089203, + "lr": 2.9758435582822087e-06, + "objective/entropy": 19.527324676513672, + "objective/kl": 7.849102973937988, + "objective/non_score_reward": -0.7849102020263672, + "objective/rlhf_reward": -0.7396408528089523, + "objective/scores": 0.6, + "policy/approxkl_avg": 43.17992401123047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49572157859802246, + "step": 126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000518321990967 + }, + { + "episode": 2048, + "epoch": 0.03681202142574685, + "loss/policy_avg": 0.02761128917336464, + "lr": 2.9756518404907975e-06, + "objective/entropy": -31.197162628173828, + "objective/kl": 11.224246978759766, + "objective/non_score_reward": -1.122424602508545, + "objective/rlhf_reward": -6.48969841003418, + "objective/scores": -0.5, + "policy/approxkl_avg": 167.82305908203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6054507493972778, + "step": 127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998534917831421 + }, + { + "episode": 2064, + "epoch": 0.037099615343135495, + "loss/policy_avg": 0.8324223756790161, + "lr": 2.9754601226993863e-06, + "objective/entropy": -79.34418487548828, + "objective/kl": 9.576016426086426, + "objective/non_score_reward": -0.95760178565979, + "objective/rlhf_reward": -5.83040714263916, + "objective/scores": -0.5, + "policy/approxkl_avg": 189.614990234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6984212398529053, + "step": 128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9968839883804321 + }, + { + "episode": 2080, + "epoch": 0.03738720926052414, + "loss/policy_avg": 0.13773420453071594, + "lr": 2.9752684049079756e-06, + "objective/entropy": 29.27914047241211, + "objective/kl": 6.572282791137695, + "objective/non_score_reward": -0.6572283506393433, + "objective/rlhf_reward": -1.0247934012749966, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 35.22698974609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4145284593105316, + "step": 129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000413417816162 + }, + { + "episode": 2096, + "epoch": 0.03767480317791279, + "loss/policy_avg": 0.14625512063503265, + "lr": 2.9750766871165644e-06, + "objective/entropy": 248.92510986328125, + "objective/kl": 8.044551849365234, + "objective/non_score_reward": -0.8044552803039551, + "objective/rlhf_reward": -5.21782112121582, + "objective/scores": -0.5, + "policy/approxkl_avg": 65.03269958496094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7755135893821716, + "step": 130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9952176809310913 + }, + { + "episode": 2112, + "epoch": 0.037962397095301435, + "loss/policy_avg": 0.08024582266807556, + "lr": 2.9748849693251536e-06, + "objective/entropy": 122.44419860839844, + "objective/kl": 17.353057861328125, + "objective/non_score_reward": -1.735305666923523, + "objective/rlhf_reward": -6.541222697496414, + "objective/scores": 0.1, + "policy/approxkl_avg": 378.7543029785156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7671464681625366, + "step": 131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9952489137649536 + }, + { + "episode": 2128, + "epoch": 0.03824999101269008, + "loss/policy_avg": 0.052941206842660904, + "lr": 2.9746932515337424e-06, + "objective/entropy": 12.529216766357422, + "objective/kl": 17.990718841552734, + "objective/non_score_reward": -1.799072027206421, + "objective/rlhf_reward": -2.7962879896163937, + "objective/scores": 1.1, + "policy/approxkl_avg": 328.53741455078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.543836236000061, + "step": 132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990971088409424 + }, + { + "episode": 2144, + "epoch": 0.03853758493007873, + "loss/policy_avg": 0.08809210360050201, + "lr": 2.9745015337423312e-06, + "objective/entropy": 92.10977172851562, + "objective/kl": 3.5646705627441406, + "objective/non_score_reward": -0.35646697878837585, + "objective/rlhf_reward": 2.974132025241852, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.450565338134766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5211790800094604, + "step": 133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005075931549072 + }, + { + "episode": 2160, + "epoch": 0.038825178847467376, + "loss/policy_avg": 0.25067800283432007, + "lr": 2.9743098159509205e-06, + "objective/entropy": -342.741455078125, + "objective/kl": 13.730775833129883, + "objective/non_score_reward": -1.373077630996704, + "objective/rlhf_reward": -5.092310494184494, + "objective/scores": 0.1, + "policy/approxkl_avg": 226.23080444335938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7523109912872314, + "step": 134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9971081018447876 + }, + { + "episode": 2176, + "epoch": 0.03911277276485602, + "loss/policy_avg": -0.05823849141597748, + "lr": 2.9741180981595093e-06, + "objective/entropy": 177.39581298828125, + "objective/kl": 2.4112541675567627, + "objective/non_score_reward": -0.2411254346370697, + "objective/rlhf_reward": -2.9645018577575684, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.3866167068481445, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4305086135864258, + "step": 135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0287272930145264 + }, + { + "episode": 2192, + "epoch": 0.03940036668224467, + "loss/policy_avg": 0.39458757638931274, + "lr": 2.973926380368098e-06, + "objective/entropy": -71.98441314697266, + "objective/kl": 11.294361114501953, + "objective/non_score_reward": -1.1294360160827637, + "objective/rlhf_reward": -1.5940250202429025, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 172.00338745117188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7612372040748596, + "step": 136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9963352680206299 + }, + { + "episode": 2208, + "epoch": 0.039687960599633317, + "loss/policy_avg": 0.6298972368240356, + "lr": 2.9737346625766873e-06, + "objective/entropy": -57.89506530761719, + "objective/kl": 8.208264350891113, + "objective/non_score_reward": -0.8208264112472534, + "objective/rlhf_reward": -2.8833055704832073, + "objective/scores": 0.1, + "policy/approxkl_avg": 89.37754821777344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.817412257194519, + "step": 137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989038705825806 + }, + { + "episode": 2224, + "epoch": 0.03997555451702196, + "loss/policy_avg": 0.07556813955307007, + "lr": 2.973542944785276e-06, + "objective/entropy": -190.96238708496094, + "objective/kl": 15.059877395629883, + "objective/non_score_reward": -1.5059877634048462, + "objective/rlhf_reward": -1.6239511057734486, + "objective/scores": 1.1, + "policy/approxkl_avg": 377.60052490234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6793420314788818, + "step": 138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978358745574951 + }, + { + "episode": 2240, + "epoch": 0.04026314843441061, + "loss/policy_avg": 0.1697106957435608, + "lr": 2.9733512269938653e-06, + "objective/entropy": 53.708675384521484, + "objective/kl": 8.52203369140625, + "objective/non_score_reward": -0.8522033095359802, + "objective/rlhf_reward": -3.0088131859898564, + "objective/scores": 0.1, + "policy/approxkl_avg": 61.883544921875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5295162796974182, + "step": 139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995434284210205 + }, + { + "episode": 2256, + "epoch": 0.04055074235179926, + "loss/policy_avg": 0.4851709008216858, + "lr": 2.973159509202454e-06, + "objective/entropy": -114.12245178222656, + "objective/kl": 8.938103675842285, + "objective/non_score_reward": -0.8938103914260864, + "objective/rlhf_reward": -3.1752414911985394, + "objective/scores": 0.1, + "policy/approxkl_avg": 119.33918762207031, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.567252516746521, + "step": 140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977459907531738 + }, + { + "episode": 2272, + "epoch": 0.040838336269187904, + "loss/policy_avg": 0.011087119579315186, + "lr": 2.972967791411043e-06, + "objective/entropy": -39.64904022216797, + "objective/kl": 10.655853271484375, + "objective/non_score_reward": -1.0655853748321533, + "objective/rlhf_reward": -2.139635088221107, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 89.38186645507812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6346065998077393, + "step": 141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0012426376342773 + }, + { + "episode": 2288, + "epoch": 0.04112593018657655, + "loss/policy_avg": 0.18634071946144104, + "lr": 2.972776073619632e-06, + "objective/entropy": -234.11532592773438, + "objective/kl": 9.971248626708984, + "objective/non_score_reward": -0.9971247911453247, + "objective/rlhf_reward": -5.988499641418457, + "objective/scores": -0.5, + "policy/approxkl_avg": 161.76507568359375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6083584427833557, + "step": 142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975717067718506 + }, + { + "episode": 2304, + "epoch": 0.0414135241039652, + "loss/policy_avg": 0.842505693435669, + "lr": 2.972584355828221e-06, + "objective/entropy": 146.90762329101562, + "objective/kl": 12.336867332458496, + "objective/non_score_reward": -1.2336868047714233, + "objective/rlhf_reward": -4.534747010469436, + "objective/scores": 0.1, + "policy/approxkl_avg": 239.418701171875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4192795753479004, + "step": 143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997382164001465 + }, + { + "episode": 2320, + "epoch": 0.04170111802135385, + "loss/policy_avg": 0.6685837507247925, + "lr": 2.9723926380368102e-06, + "objective/entropy": 41.359195709228516, + "objective/kl": 9.3118896484375, + "objective/non_score_reward": -0.9311891794204712, + "objective/rlhf_reward": 0.6752433419227604, + "objective/scores": 1.1, + "policy/approxkl_avg": 151.73721313476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5029563307762146, + "step": 144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993720054626465 + }, + { + "episode": 2336, + "epoch": 0.0419887119387425, + "loss/policy_avg": 0.2414681613445282, + "lr": 2.9722009202453986e-06, + "objective/entropy": -22.124481201171875, + "objective/kl": 10.415138244628906, + "objective/non_score_reward": -1.0415138006210327, + "objective/rlhf_reward": -3.7660551875829693, + "objective/scores": 0.1, + "policy/approxkl_avg": 117.5968246459961, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5334524512290955, + "step": 145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998509407043457 + }, + { + "episode": 2352, + "epoch": 0.042276305856131145, + "loss/policy_avg": -0.1104317232966423, + "lr": 2.972009202453988e-06, + "objective/entropy": 89.73234558105469, + "objective/kl": 9.888954162597656, + "objective/non_score_reward": -0.9888954162597656, + "objective/rlhf_reward": -3.5555818438529965, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.99063110351562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5904539823532104, + "step": 146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002488851547241 + }, + { + "episode": 2368, + "epoch": 0.04256389977351979, + "loss/policy_avg": 0.6591033935546875, + "lr": 2.9718174846625767e-06, + "objective/entropy": 149.4038543701172, + "objective/kl": 10.047257423400879, + "objective/non_score_reward": -1.0047259330749512, + "objective/rlhf_reward": -6.018903732299805, + "objective/scores": -0.5, + "policy/approxkl_avg": 178.25811767578125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9356397986412048, + "step": 147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997607707977295 + }, + { + "episode": 2384, + "epoch": 0.04285149369090844, + "loss/policy_avg": 0.544201135635376, + "lr": 2.9716257668711655e-06, + "objective/entropy": 132.01980590820312, + "objective/kl": 9.59277629852295, + "objective/non_score_reward": -0.9592776298522949, + "objective/rlhf_reward": -3.437110504508018, + "objective/scores": 0.1, + "policy/approxkl_avg": 97.24102783203125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8296840190887451, + "step": 148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0014381408691406 + }, + { + "episode": 2400, + "epoch": 0.043139087608297086, + "loss/policy_avg": 0.36106303334236145, + "lr": 2.9714340490797547e-06, + "objective/entropy": 260.59033203125, + "objective/kl": 11.327485084533691, + "objective/non_score_reward": -1.1327484846115112, + "objective/rlhf_reward": -2.4082878849664073, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 112.34231567382812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6541630029678345, + "step": 149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988218545913696 + }, + { + "episode": 2416, + "epoch": 0.04342668152568573, + "loss/policy_avg": 0.30818748474121094, + "lr": 2.9712423312883435e-06, + "objective/entropy": 167.4129180908203, + "objective/kl": 9.699304580688477, + "objective/non_score_reward": -0.9699304103851318, + "objective/rlhf_reward": -5.879721641540527, + "objective/scores": -0.5, + "policy/approxkl_avg": 82.35363006591797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6681854724884033, + "step": 150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972755908966064 + }, + { + "episode": 2432, + "epoch": 0.04371427544307438, + "loss/policy_avg": 0.6923952102661133, + "lr": 2.9710506134969323e-06, + "objective/entropy": 48.51850128173828, + "objective/kl": 7.859927177429199, + "objective/non_score_reward": -0.7859926223754883, + "objective/rlhf_reward": -2.743970593810081, + "objective/scores": 0.1, + "policy/approxkl_avg": 103.42765808105469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7212280035018921, + "step": 151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981834888458252 + }, + { + "episode": 2448, + "epoch": 0.044001869360463026, + "loss/policy_avg": 0.11756162345409393, + "lr": 2.9708588957055216e-06, + "objective/entropy": -18.197547912597656, + "objective/kl": 9.619758605957031, + "objective/non_score_reward": -0.9619758129119873, + "objective/rlhf_reward": -2.243783268992024, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 92.98655700683594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6553933620452881, + "step": 152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970009326934814 + }, + { + "episode": 2464, + "epoch": 0.04428946327785167, + "loss/policy_avg": 0.1137843132019043, + "lr": 2.9706671779141104e-06, + "objective/entropy": 120.47866821289062, + "objective/kl": 12.719396591186523, + "objective/non_score_reward": -1.2719398736953735, + "objective/rlhf_reward": -4.687759546935558, + "objective/scores": 0.1, + "policy/approxkl_avg": 160.37051391601562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5731196403503418, + "step": 153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005669593811035 + }, + { + "episode": 2480, + "epoch": 0.04457705719524032, + "loss/policy_avg": 0.13933295011520386, + "lr": 2.9704754601226996e-06, + "objective/entropy": -150.15121459960938, + "objective/kl": 5.141759395599365, + "objective/non_score_reward": -0.5141758918762207, + "objective/rlhf_reward": -4.056703567504883, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.49239730834961, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7959345579147339, + "step": 154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0006468296051025 + }, + { + "episode": 2496, + "epoch": 0.04486465111262897, + "loss/policy_avg": 0.0854576975107193, + "lr": 2.9702837423312884e-06, + "objective/entropy": 143.63348388671875, + "objective/kl": 11.670942306518555, + "objective/non_score_reward": -1.1670942306518555, + "objective/rlhf_reward": -4.26837727278471, + "objective/scores": 0.1, + "policy/approxkl_avg": 232.97808837890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9138456583023071, + "step": 155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975340366363525 + }, + { + "episode": 2512, + "epoch": 0.045152245030017614, + "loss/policy_avg": 0.026141434907913208, + "lr": 2.9700920245398772e-06, + "objective/entropy": -4.6529083251953125, + "objective/kl": 7.081835746765137, + "objective/non_score_reward": -0.7081836462020874, + "objective/rlhf_reward": -4.832734107971191, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.927255630493164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6679246425628662, + "step": 156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991931915283203 + }, + { + "episode": 2528, + "epoch": 0.04543983894740626, + "loss/policy_avg": 0.15662409365177155, + "lr": 2.9699003067484665e-06, + "objective/entropy": 175.9498291015625, + "objective/kl": 6.773474216461182, + "objective/non_score_reward": -0.6773474216461182, + "objective/rlhf_reward": -2.3093897461891175, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.37593078613281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.47304588556289673, + "step": 157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973795413970947 + }, + { + "episode": 2544, + "epoch": 0.04572743286479491, + "loss/policy_avg": 0.23720163106918335, + "lr": 2.9697085889570553e-06, + "objective/entropy": -89.0332260131836, + "objective/kl": 8.619776725769043, + "objective/non_score_reward": -0.8619776964187622, + "objective/rlhf_reward": -0.5241918011915412, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 29.201887130737305, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6532964110374451, + "step": 158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988105297088623 + }, + { + "episode": 2560, + "epoch": 0.046015026782183555, + "loss/policy_avg": 0.2267841100692749, + "lr": 2.9695168711656445e-06, + "objective/entropy": 1.6421661376953125, + "objective/kl": 9.304061889648438, + "objective/non_score_reward": -0.9304060935974121, + "objective/rlhf_reward": 0.6783757746219639, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.62976837158203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7413831949234009, + "step": 159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987226724624634 + }, + { + "episode": 2576, + "epoch": 0.0463026206995722, + "loss/policy_avg": 0.5093711614608765, + "lr": 2.9693251533742333e-06, + "objective/entropy": -36.8858528137207, + "objective/kl": 11.298078536987305, + "objective/non_score_reward": -1.1298078298568726, + "objective/rlhf_reward": -6.51923131942749, + "objective/scores": -0.5, + "policy/approxkl_avg": 135.19918823242188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6970053911209106, + "step": 160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0019335746765137 + }, + { + "episode": 2592, + "epoch": 0.04659021461696085, + "loss/policy_avg": 0.17401546239852905, + "lr": 2.969133435582822e-06, + "objective/entropy": 172.7603759765625, + "objective/kl": 9.036718368530273, + "objective/non_score_reward": -0.9036718606948853, + "objective/rlhf_reward": -3.2146874427795407, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.195972442626953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5838289260864258, + "step": 161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981424808502197 + }, + { + "episode": 2608, + "epoch": 0.046877808534349495, + "loss/policy_avg": 0.44493553042411804, + "lr": 2.9689417177914114e-06, + "objective/entropy": -19.54338836669922, + "objective/kl": 11.437591552734375, + "objective/non_score_reward": -1.1437591314315796, + "objective/rlhf_reward": -6.57503604888916, + "objective/scores": -0.5, + "policy/approxkl_avg": 146.39810180664062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.468772292137146, + "step": 162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998663067817688 + }, + { + "episode": 2624, + "epoch": 0.04716540245173815, + "loss/policy_avg": 2.674943447113037, + "lr": 2.96875e-06, + "objective/entropy": -46.63656234741211, + "objective/kl": 3.6555447578430176, + "objective/non_score_reward": -0.3655545115470886, + "objective/rlhf_reward": 0.48519318274981194, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.321266174316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8199789524078369, + "step": 163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0079751014709473 + }, + { + "episode": 2640, + "epoch": 0.047452996369126796, + "loss/policy_avg": 0.1934666782617569, + "lr": 2.968558282208589e-06, + "objective/entropy": -81.56670379638672, + "objective/kl": 12.988276481628418, + "objective/non_score_reward": -1.2988277673721313, + "objective/rlhf_reward": -2.2715921520602436, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 293.0709228515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7369424104690552, + "step": 164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997572898864746 + }, + { + "episode": 2656, + "epoch": 0.04774059028651544, + "loss/policy_avg": 0.15999506413936615, + "lr": 2.968366564417178e-06, + "objective/entropy": 46.440673828125, + "objective/kl": 13.255866050720215, + "objective/non_score_reward": -1.3255865573883057, + "objective/rlhf_reward": -7.302346229553223, + "objective/scores": -0.5, + "policy/approxkl_avg": 191.44906616210938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8060017824172974, + "step": 165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992680549621582 + }, + { + "episode": 2672, + "epoch": 0.04802818420390409, + "loss/policy_avg": -0.11308600753545761, + "lr": 2.968174846625767e-06, + "objective/entropy": 229.09007263183594, + "objective/kl": 10.140392303466797, + "objective/non_score_reward": -1.0140392780303955, + "objective/rlhf_reward": -3.6561572611331936, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.9848861694336, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8131240606307983, + "step": 166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0012285709381104 + }, + { + "episode": 2688, + "epoch": 0.048315778121292736, + "loss/policy_avg": 0.2457667887210846, + "lr": 2.967983128834356e-06, + "objective/entropy": -224.82394409179688, + "objective/kl": 5.642745018005371, + "objective/non_score_reward": -0.5642745494842529, + "objective/rlhf_reward": 0.666621024965075, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 43.03296661376953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.596450686454773, + "step": 167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9965918064117432 + }, + { + "episode": 2704, + "epoch": 0.04860337203868138, + "loss/policy_avg": 0.16231290996074677, + "lr": 2.9677914110429446e-06, + "objective/entropy": 53.96503829956055, + "objective/kl": 9.837738037109375, + "objective/non_score_reward": -0.9837738275527954, + "objective/rlhf_reward": -5.935094833374023, + "objective/scores": -0.5, + "policy/approxkl_avg": 144.24468994140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5579663515090942, + "step": 168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979324340820312 + }, + { + "episode": 2720, + "epoch": 0.04889096595607003, + "loss/policy_avg": 0.5551764369010925, + "lr": 2.967599693251534e-06, + "objective/entropy": -89.17186737060547, + "objective/kl": 4.356380939483643, + "objective/non_score_reward": -0.4356381893157959, + "objective/rlhf_reward": 0.08227607312529184, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 33.07392120361328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44729527831077576, + "step": 169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999492168426514 + }, + { + "episode": 2736, + "epoch": 0.04917855987345868, + "loss/policy_avg": 0.4932054579257965, + "lr": 2.9674079754601227e-06, + "objective/entropy": -11.498092651367188, + "objective/kl": 10.226805686950684, + "objective/non_score_reward": -1.022680401802063, + "objective/rlhf_reward": -1.1670026972305505, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 102.75898742675781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6330607533454895, + "step": 170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996318817138672 + }, + { + "episode": 2752, + "epoch": 0.049466153790847324, + "loss/policy_avg": 0.1898762285709381, + "lr": 2.9672162576687115e-06, + "objective/entropy": -70.60189056396484, + "objective/kl": 9.331042289733887, + "objective/non_score_reward": -0.9331042766571045, + "objective/rlhf_reward": -3.3324170172214505, + "objective/scores": 0.1, + "policy/approxkl_avg": 114.848388671875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6309188604354858, + "step": 171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009915828704834 + }, + { + "episode": 2768, + "epoch": 0.04975374770823597, + "loss/policy_avg": 0.2297024428844452, + "lr": 2.9670245398773007e-06, + "objective/entropy": 16.196762084960938, + "objective/kl": 11.903242111206055, + "objective/non_score_reward": -1.1903241872787476, + "objective/rlhf_reward": -2.638590636030708, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 139.62954711914062, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.705674946308136, + "step": 172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981346130371094 + }, + { + "episode": 2784, + "epoch": 0.05004134162562462, + "loss/policy_avg": 0.39737239480018616, + "lr": 2.9668328220858895e-06, + "objective/entropy": -89.89057922363281, + "objective/kl": 7.809091567993164, + "objective/non_score_reward": -0.7809092402458191, + "objective/rlhf_reward": -5.123636722564697, + "objective/scores": -0.5, + "policy/approxkl_avg": 93.33058166503906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.9152443408966064, + "step": 173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9968818426132202 + }, + { + "episode": 2800, + "epoch": 0.050328935543013265, + "loss/policy_avg": 0.06229601055383682, + "lr": 2.9666411042944783e-06, + "objective/entropy": 105.8713607788086, + "objective/kl": 10.667573928833008, + "objective/non_score_reward": -1.0667574405670166, + "objective/rlhf_reward": -2.6629095709958843, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 72.16740417480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5240480899810791, + "step": 174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984283447265625 + }, + { + "episode": 2816, + "epoch": 0.05061652946040191, + "loss/policy_avg": 0.23533995449543, + "lr": 2.9664493865030676e-06, + "objective/entropy": 73.5090103149414, + "objective/kl": 8.236711502075195, + "objective/non_score_reward": -0.8236711621284485, + "objective/rlhf_reward": 1.1053153514862064, + "objective/scores": 1.1, + "policy/approxkl_avg": 86.34696197509766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.549429714679718, + "step": 175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980177879333496 + }, + { + "episode": 2832, + "epoch": 0.05090412337779056, + "loss/policy_avg": 0.23864325881004333, + "lr": 2.9662576687116564e-06, + "objective/entropy": 44.60013198852539, + "objective/kl": 11.681440353393555, + "objective/non_score_reward": -1.1681439876556396, + "objective/rlhf_reward": -4.272575950622558, + "objective/scores": 0.1, + "policy/approxkl_avg": 141.5290985107422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6180237531661987, + "step": 176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996826410293579 + }, + { + "episode": 2848, + "epoch": 0.051191717295179205, + "loss/policy_avg": 0.12463901191949844, + "lr": 2.9660659509202456e-06, + "objective/entropy": -185.71621704101562, + "objective/kl": 6.215152263641357, + "objective/non_score_reward": -0.6215152740478516, + "objective/rlhf_reward": 1.9139389447867874, + "objective/scores": 1.1, + "policy/approxkl_avg": 31.746864318847656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6655900478363037, + "step": 177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984078407287598 + }, + { + "episode": 2864, + "epoch": 0.05147931121256785, + "loss/policy_avg": 0.41434115171432495, + "lr": 2.9658742331288344e-06, + "objective/entropy": 125.62101745605469, + "objective/kl": 9.051776885986328, + "objective/non_score_reward": -0.9051777124404907, + "objective/rlhf_reward": -1.2207108795642851, + "objective/scores": 0.6, + "policy/approxkl_avg": 89.37495422363281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6031736731529236, + "step": 178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9960637092590332 + }, + { + "episode": 2880, + "epoch": 0.0517669051299565, + "loss/policy_avg": 0.0819600522518158, + "lr": 2.9656825153374232e-06, + "objective/entropy": 167.4649658203125, + "objective/kl": 11.02088737487793, + "objective/non_score_reward": -1.1020888090133667, + "objective/rlhf_reward": -2.7464959226256473, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 87.59506225585938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5513530969619751, + "step": 179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990684986114502 + }, + { + "episode": 2896, + "epoch": 0.052054499047345146, + "loss/policy_avg": 0.28252676129341125, + "lr": 2.9654907975460125e-06, + "objective/entropy": -71.35255432128906, + "objective/kl": 6.714944362640381, + "objective/non_score_reward": -0.6714943647384644, + "objective/rlhf_reward": 1.7140224814414982, + "objective/scores": 1.1, + "policy/approxkl_avg": 61.805137634277344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5994826555252075, + "step": 180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976210594177246 + }, + { + "episode": 2912, + "epoch": 0.05234209296473379, + "loss/policy_avg": 0.0703403651714325, + "lr": 2.9652990797546013e-06, + "objective/entropy": -239.35452270507812, + "objective/kl": 10.807499885559082, + "objective/non_score_reward": -1.0807499885559082, + "objective/rlhf_reward": -6.322999954223633, + "objective/scores": -0.5, + "policy/approxkl_avg": 269.2040710449219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.626622200012207, + "step": 181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9963631629943848 + }, + { + "episode": 2928, + "epoch": 0.052629686882122446, + "loss/policy_avg": 0.5756047964096069, + "lr": 2.9651073619631905e-06, + "objective/entropy": 81.2969741821289, + "objective/kl": 9.503179550170898, + "objective/non_score_reward": -0.9503180384635925, + "objective/rlhf_reward": -1.6785659066596368, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 69.6463394165039, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4174951910972595, + "step": 182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000248432159424 + }, + { + "episode": 2944, + "epoch": 0.05291728079951109, + "loss/policy_avg": 0.19677188992500305, + "lr": 2.9649156441717793e-06, + "objective/entropy": 97.329345703125, + "objective/kl": 7.023814678192139, + "objective/non_score_reward": -0.702381432056427, + "objective/rlhf_reward": -4.809525966644287, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.869384765625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5866813659667969, + "step": 183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983813762664795 + }, + { + "episode": 2960, + "epoch": 0.05320487471689974, + "loss/policy_avg": -0.09021998941898346, + "lr": 2.964723926380368e-06, + "objective/entropy": -104.08444213867188, + "objective/kl": 8.810539245605469, + "objective/non_score_reward": -0.8810538649559021, + "objective/rlhf_reward": -3.1242154896259304, + "objective/scores": 0.1, + "policy/approxkl_avg": 64.01618957519531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6541459560394287, + "step": 184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997464418411255 + }, + { + "episode": 2976, + "epoch": 0.05349246863428839, + "loss/policy_avg": 0.7093124389648438, + "lr": 2.9645322085889574e-06, + "objective/entropy": -2.2389583587646484, + "objective/kl": 12.9491548538208, + "objective/non_score_reward": -1.2949154376983643, + "objective/rlhf_reward": -3.056955727116142, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 159.49282836914062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5769963264465332, + "step": 185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998090386390686 + }, + { + "episode": 2992, + "epoch": 0.053780062551677034, + "loss/policy_avg": 0.6172127723693848, + "lr": 2.964340490797546e-06, + "objective/entropy": 96.50690460205078, + "objective/kl": 9.217771530151367, + "objective/non_score_reward": -0.9217771291732788, + "objective/rlhf_reward": 0.7128913417458538, + "objective/scores": 1.1, + "policy/approxkl_avg": 116.59093475341797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5699312686920166, + "step": 186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982131719589233 + }, + { + "episode": 3008, + "epoch": 0.05406765646906568, + "loss/policy_avg": 0.1616460084915161, + "lr": 2.964148773006135e-06, + "objective/entropy": -13.621841430664062, + "objective/kl": 11.028844833374023, + "objective/non_score_reward": -1.1028845310211182, + "objective/rlhf_reward": -2.8074182084837727, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 125.17231750488281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6993151903152466, + "step": 187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992413520812988 + }, + { + "episode": 3024, + "epoch": 0.05435525038645433, + "loss/policy_avg": -0.022246820852160454, + "lr": 2.9639570552147242e-06, + "objective/entropy": -95.69093322753906, + "objective/kl": 8.957221031188965, + "objective/non_score_reward": -0.8957222700119019, + "objective/rlhf_reward": -3.1828890204429623, + "objective/scores": 0.1, + "policy/approxkl_avg": 129.6190948486328, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7190344333648682, + "step": 188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0001869201660156 + }, + { + "episode": 3040, + "epoch": 0.054642844303842975, + "loss/policy_avg": 0.3806031346321106, + "lr": 2.9637653374233126e-06, + "objective/entropy": 31.125537872314453, + "objective/kl": 14.289737701416016, + "objective/non_score_reward": -1.428973913192749, + "objective/rlhf_reward": -5.315895533561706, + "objective/scores": 0.1, + "policy/approxkl_avg": 252.6319122314453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4689924716949463, + "step": 189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998744010925293 + }, + { + "episode": 3056, + "epoch": 0.05493043822123162, + "loss/policy_avg": 0.38266557455062866, + "lr": 2.963573619631902e-06, + "objective/entropy": -70.03289794921875, + "objective/kl": 11.780672073364258, + "objective/non_score_reward": -1.1780673265457153, + "objective/rlhf_reward": -1.7885502024900646, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 211.99916076660156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.736452043056488, + "step": 190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998718023300171 + }, + { + "episode": 3072, + "epoch": 0.05521803213862027, + "loss/policy_avg": -0.1702420711517334, + "lr": 2.9633819018404906e-06, + "objective/entropy": -88.13114166259766, + "objective/kl": 5.207786560058594, + "objective/non_score_reward": -0.5207787156105042, + "objective/rlhf_reward": -0.13570352919572182, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 66.82286834716797, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6916394233703613, + "step": 191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0128471851348877 + }, + { + "episode": 3088, + "epoch": 0.055505626056008915, + "loss/policy_avg": 0.47809040546417236, + "lr": 2.96319018404908e-06, + "objective/entropy": -146.3225555419922, + "objective/kl": 9.557235717773438, + "objective/non_score_reward": -0.9557235240936279, + "objective/rlhf_reward": -1.8754830089973764, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 62.17351531982422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6510109901428223, + "step": 192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980770349502563 + }, + { + "episode": 3104, + "epoch": 0.05579321997339756, + "loss/policy_avg": 0.695833683013916, + "lr": 2.9629984662576687e-06, + "objective/entropy": 100.23960876464844, + "objective/kl": 9.5051908493042, + "objective/non_score_reward": -0.9505190849304199, + "objective/rlhf_reward": -3.4020764887332913, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.06460571289062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7171235084533691, + "step": 193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999764084815979 + }, + { + "episode": 3120, + "epoch": 0.05608081389078621, + "loss/policy_avg": 0.16662254929542542, + "lr": 2.9628067484662575e-06, + "objective/entropy": 166.84793090820312, + "objective/kl": 12.172272682189941, + "objective/non_score_reward": -1.2172273397445679, + "objective/rlhf_reward": -4.4689091391861435, + "objective/scores": 0.1, + "policy/approxkl_avg": 133.61741638183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5888961553573608, + "step": 194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996709823608398 + }, + { + "episode": 3136, + "epoch": 0.056368407808174856, + "loss/policy_avg": 0.7391001582145691, + "lr": 2.9626150306748467e-06, + "objective/entropy": 7.757408142089844, + "objective/kl": 4.110318183898926, + "objective/non_score_reward": -0.4110318422317505, + "objective/rlhf_reward": 2.755872675776482, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.918201446533203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6208719611167908, + "step": 195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999298095703125 + }, + { + "episode": 3152, + "epoch": 0.0566560017255635, + "loss/policy_avg": -0.04099520295858383, + "lr": 2.9624233128834355e-06, + "objective/entropy": -164.4539337158203, + "objective/kl": 7.288479328155518, + "objective/non_score_reward": -0.7288479804992676, + "objective/rlhf_reward": 1.4846080929040912, + "objective/scores": 1.1, + "policy/approxkl_avg": 88.62603759765625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7371472120285034, + "step": 196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998602867126465 + }, + { + "episode": 3168, + "epoch": 0.05694359564295215, + "loss/policy_avg": 0.10230283439159393, + "lr": 2.9622315950920248e-06, + "objective/entropy": -6.653453826904297, + "objective/kl": 10.225364685058594, + "objective/non_score_reward": -1.0225365161895752, + "objective/rlhf_reward": -2.265317554744791, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 116.62469482421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6673995852470398, + "step": 197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9955484867095947 + }, + { + "episode": 3184, + "epoch": 0.057231189560340796, + "loss/policy_avg": 0.1610383689403534, + "lr": 2.9620398773006136e-06, + "objective/entropy": 83.79240417480469, + "objective/kl": 6.000893592834473, + "objective/non_score_reward": -0.6000893712043762, + "objective/rlhf_reward": -2.000357484817505, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.31947326660156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49990415573120117, + "step": 198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981744289398193 + }, + { + "episode": 3200, + "epoch": 0.05751878347772944, + "loss/policy_avg": 2.8004019260406494, + "lr": 2.9618481595092024e-06, + "objective/entropy": 105.93596649169922, + "objective/kl": 9.928138732910156, + "objective/non_score_reward": -0.9928138852119446, + "objective/rlhf_reward": 0.4287445038557056, + "objective/scores": 1.1, + "policy/approxkl_avg": 68.9993667602539, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5279573202133179, + "step": 199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0012660026550293 + }, + { + "episode": 3216, + "epoch": 0.05780637739511809, + "loss/policy_avg": 0.1186942383646965, + "lr": 2.9616564417177916e-06, + "objective/entropy": 176.61386108398438, + "objective/kl": 15.481854438781738, + "objective/non_score_reward": -1.5481854677200317, + "objective/rlhf_reward": -8.192741394042969, + "objective/scores": -0.5, + "policy/approxkl_avg": 287.8005676269531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8114917278289795, + "step": 200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980297088623047 + }, + { + "episode": 3232, + "epoch": 0.058093971312506744, + "loss/policy_avg": 0.12549322843551636, + "lr": 2.9614647239263804e-06, + "objective/entropy": -43.08008575439453, + "objective/kl": 1.7479121685028076, + "objective/non_score_reward": -0.17479124665260315, + "objective/rlhf_reward": 1.1256637878987084, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 1.0369627475738525, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6038674116134644, + "step": 201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001643657684326 + }, + { + "episode": 3248, + "epoch": 0.05838156522989539, + "loss/policy_avg": 0.37325456738471985, + "lr": 2.9612730061349692e-06, + "objective/entropy": -55.67109680175781, + "objective/kl": 16.567649841308594, + "objective/non_score_reward": -1.6567649841308594, + "objective/rlhf_reward": -8.627059936523438, + "objective/scores": -0.5, + "policy/approxkl_avg": 345.91241455078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.688873291015625, + "step": 202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9956648349761963 + }, + { + "episode": 3264, + "epoch": 0.05866915914728404, + "loss/policy_avg": 0.4203230142593384, + "lr": 2.9610812883435585e-06, + "objective/entropy": -102.62028503417969, + "objective/kl": 8.865509033203125, + "objective/non_score_reward": -0.8865509629249573, + "objective/rlhf_reward": -0.622484960348579, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 93.6322021484375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6493798494338989, + "step": 203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986227750778198 + }, + { + "episode": 3280, + "epoch": 0.058956753064672685, + "loss/policy_avg": 0.6964031457901001, + "lr": 2.9608895705521473e-06, + "objective/entropy": 110.6749496459961, + "objective/kl": 17.85771369934082, + "objective/non_score_reward": -1.785771131515503, + "objective/rlhf_reward": -9.143084526062012, + "objective/scores": -0.5, + "policy/approxkl_avg": 490.62164306640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6564576625823975, + "step": 204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979726076126099 + }, + { + "episode": 3296, + "epoch": 0.05924434698206133, + "loss/policy_avg": 0.2527036964893341, + "lr": 2.9606978527607365e-06, + "objective/entropy": -116.59994506835938, + "objective/kl": 11.40339469909668, + "objective/non_score_reward": -1.1403393745422363, + "objective/rlhf_reward": -2.161357662081718, + "objective/scores": 0.6, + "policy/approxkl_avg": 91.01628112792969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6173946857452393, + "step": 205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987362623214722 + }, + { + "episode": 3312, + "epoch": 0.05953194089944998, + "loss/policy_avg": 0.2560387849807739, + "lr": 2.9605061349693253e-06, + "objective/entropy": -39.499755859375, + "objective/kl": 6.948145866394043, + "objective/non_score_reward": -0.6948145627975464, + "objective/rlhf_reward": 1.6207415699958805, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.67678833007812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6708812713623047, + "step": 206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997892141342163 + }, + { + "episode": 3328, + "epoch": 0.059819534816838625, + "loss/policy_avg": 0.27111512422561646, + "lr": 2.960314417177914e-06, + "objective/entropy": 84.36082458496094, + "objective/kl": 7.624824523925781, + "objective/non_score_reward": -0.7624824643135071, + "objective/rlhf_reward": -5.049929618835449, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.3857879638672, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.859719455242157, + "step": 207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986274242401123 + }, + { + "episode": 3344, + "epoch": 0.06010712873422727, + "loss/policy_avg": 0.3739089071750641, + "lr": 2.9601226993865034e-06, + "objective/entropy": -137.59747314453125, + "objective/kl": 7.363832950592041, + "objective/non_score_reward": -0.7363832592964172, + "objective/rlhf_reward": -2.5455331265926358, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.08015441894531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6479380130767822, + "step": 208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974188804626465 + }, + { + "episode": 3360, + "epoch": 0.06039472265161592, + "loss/policy_avg": 0.2446056306362152, + "lr": 2.959930981595092e-06, + "objective/entropy": 87.66815185546875, + "objective/kl": 6.449171543121338, + "objective/non_score_reward": -0.6449171900749207, + "objective/rlhf_reward": -2.1796687602996823, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.2250862121582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4830004572868347, + "step": 209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996240139007568 + }, + { + "episode": 3376, + "epoch": 0.060682316569004566, + "loss/policy_avg": 0.19204621016979218, + "lr": 2.9597392638036814e-06, + "objective/entropy": 67.65581512451172, + "objective/kl": 9.747137069702148, + "objective/non_score_reward": -0.9747136831283569, + "objective/rlhf_reward": -3.4988546952605244, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.63888549804688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5157696604728699, + "step": 210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9994913339614868 + }, + { + "episode": 3392, + "epoch": 0.06096991048639321, + "loss/policy_avg": -0.3198900520801544, + "lr": 2.9595475460122702e-06, + "objective/entropy": 64.95191955566406, + "objective/kl": 6.794856071472168, + "objective/non_score_reward": -0.6794856190681458, + "objective/rlhf_reward": -2.317942655086517, + "objective/scores": 0.1, + "policy/approxkl_avg": 64.20761108398438, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6942774653434753, + "step": 211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.003568649291992 + }, + { + "episode": 3408, + "epoch": 0.06125750440378186, + "loss/policy_avg": 0.8114681839942932, + "lr": 2.959355828220859e-06, + "objective/entropy": 88.22362518310547, + "objective/kl": 10.00731372833252, + "objective/non_score_reward": -1.0007314682006836, + "objective/rlhf_reward": 0.39707423150539434, + "objective/scores": 1.1, + "policy/approxkl_avg": 100.05908203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5509470105171204, + "step": 212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995248317718506 + }, + { + "episode": 3424, + "epoch": 0.061545098321170506, + "loss/policy_avg": 0.3652896285057068, + "lr": 2.959164110429448e-06, + "objective/entropy": -220.09613037109375, + "objective/kl": 8.2984037399292, + "objective/non_score_reward": -0.8298404216766357, + "objective/rlhf_reward": -2.919361627101898, + "objective/scores": 0.1, + "policy/approxkl_avg": 93.64591979980469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5828021168708801, + "step": 213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000081777572632 + }, + { + "episode": 3440, + "epoch": 0.06183269223855915, + "loss/policy_avg": 0.028781473636627197, + "lr": 2.9589723926380366e-06, + "objective/entropy": 72.11604309082031, + "objective/kl": 2.359449625015259, + "objective/non_score_reward": -0.2359449863433838, + "objective/rlhf_reward": -0.5437799677252769, + "objective/scores": 0.1, + "policy/approxkl_avg": 0.3476608991622925, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4451746940612793, + "step": 214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0019702911376953 + }, + { + "episode": 3456, + "epoch": 0.0621202861559478, + "loss/policy_avg": 0.5238405466079712, + "lr": 2.958780674846626e-06, + "objective/entropy": 94.75743103027344, + "objective/kl": 12.506973266601562, + "objective/non_score_reward": -1.250697374343872, + "objective/rlhf_reward": -4.602789735794067, + "objective/scores": 0.1, + "policy/approxkl_avg": 134.81378173828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6946749091148376, + "step": 215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987130165100098 + }, + { + "episode": 3472, + "epoch": 0.06240788007333645, + "loss/policy_avg": 0.357485830783844, + "lr": 2.9585889570552147e-06, + "objective/entropy": -56.58507537841797, + "objective/kl": 15.03990364074707, + "objective/non_score_reward": -1.5039904117584229, + "objective/rlhf_reward": -4.068550298886235, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 188.71969604492188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5828025341033936, + "step": 216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974546432495117 + }, + { + "episode": 3488, + "epoch": 0.0626954739907251, + "loss/policy_avg": 0.5269087553024292, + "lr": 2.9583972392638035e-06, + "objective/entropy": -54.57160949707031, + "objective/kl": 10.709373474121094, + "objective/non_score_reward": -1.0709375143051147, + "objective/rlhf_reward": -6.283750057220459, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.47734069824219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4404389560222626, + "step": 217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983222484588623 + }, + { + "episode": 3504, + "epoch": 0.06298306790811374, + "loss/policy_avg": 0.17589987814426422, + "lr": 2.9582055214723927e-06, + "objective/entropy": 64.53421020507812, + "objective/kl": 7.162571907043457, + "objective/non_score_reward": -0.7162571549415588, + "objective/rlhf_reward": -0.7423223874726631, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 121.216552734375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4267624020576477, + "step": 218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995992183685303 + }, + { + "episode": 3520, + "epoch": 0.0632706618255024, + "loss/policy_avg": 0.26339495182037354, + "lr": 2.9580138036809815e-06, + "objective/entropy": 229.096435546875, + "objective/kl": 11.569236755371094, + "objective/non_score_reward": -1.156923532485962, + "objective/rlhf_reward": -1.7039751603615017, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 217.75863647460938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7306234836578369, + "step": 219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9954323768615723 + }, + { + "episode": 3536, + "epoch": 0.06355825574289103, + "loss/policy_avg": 0.14159545302391052, + "lr": 2.9578220858895708e-06, + "objective/entropy": -69.43865966796875, + "objective/kl": 5.585199356079102, + "objective/non_score_reward": -0.5585199594497681, + "objective/rlhf_reward": -0.11137341179040439, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 20.743200302124023, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7005258202552795, + "step": 220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997767448425293 + }, + { + "episode": 3552, + "epoch": 0.06384584966027969, + "loss/policy_avg": -0.5519238710403442, + "lr": 2.9576303680981596e-06, + "objective/entropy": 208.38470458984375, + "objective/kl": 6.988656997680664, + "objective/non_score_reward": -0.6988657712936401, + "objective/rlhf_reward": -2.395462906360626, + "objective/scores": 0.1, + "policy/approxkl_avg": 93.66447448730469, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7662152051925659, + "step": 221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0369105339050293 + }, + { + "episode": 3568, + "epoch": 0.06413344357766833, + "loss/policy_avg": 0.047732796519994736, + "lr": 2.9574386503067484e-06, + "objective/entropy": 143.40084838867188, + "objective/kl": 11.851188659667969, + "objective/non_score_reward": -1.1851186752319336, + "objective/rlhf_reward": -2.340474939346313, + "objective/scores": 0.6, + "policy/approxkl_avg": 151.91505432128906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7239178419113159, + "step": 222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001242160797119 + }, + { + "episode": 3584, + "epoch": 0.06442103749505698, + "loss/policy_avg": 0.7117222547531128, + "lr": 2.9572469325153376e-06, + "objective/entropy": -145.98013305664062, + "objective/kl": 12.114925384521484, + "objective/non_score_reward": -1.2114924192428589, + "objective/rlhf_reward": -3.1126364628473913, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 156.07591247558594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7104471921920776, + "step": 223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9960601329803467 + }, + { + "episode": 3600, + "epoch": 0.06470863141244562, + "loss/policy_avg": -0.4437275230884552, + "lr": 2.9570552147239264e-06, + "objective/entropy": 7.181800842285156, + "objective/kl": 4.8645524978637695, + "objective/non_score_reward": -0.4864552319049835, + "objective/rlhf_reward": -1.5458209127187728, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.22005462646484, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.3355029821395874, + "step": 224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003509044647217 + }, + { + "episode": 3616, + "epoch": 0.06499622532983428, + "loss/policy_avg": 0.09265188872814178, + "lr": 2.9568634969325152e-06, + "objective/entropy": -261.4568176269531, + "objective/kl": 8.175820350646973, + "objective/non_score_reward": -0.8175821304321289, + "objective/rlhf_reward": -2.870328368991613, + "objective/scores": 0.1, + "policy/approxkl_avg": 53.4001350402832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6872222423553467, + "step": 225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999666690826416 + }, + { + "episode": 3632, + "epoch": 0.06528381924722292, + "loss/policy_avg": 0.27531012892723083, + "lr": 2.9566717791411045e-06, + "objective/entropy": 15.044029235839844, + "objective/kl": 10.096330642700195, + "objective/non_score_reward": -1.0096330642700195, + "objective/rlhf_reward": -6.038532257080078, + "objective/scores": -0.5, + "policy/approxkl_avg": 120.44912719726562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7716231346130371, + "step": 226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978790283203125 + }, + { + "episode": 3648, + "epoch": 0.06557141316461157, + "loss/policy_avg": 0.27537640929222107, + "lr": 2.9564800613496933e-06, + "objective/entropy": 171.693359375, + "objective/kl": 13.582971572875977, + "objective/non_score_reward": -1.3582972288131714, + "objective/rlhf_reward": -2.5094699307691783, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 178.1964111328125, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6318575143814087, + "step": 227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985480308532715 + }, + { + "episode": 3664, + "epoch": 0.06585900708200021, + "loss/policy_avg": 0.5710182785987854, + "lr": 2.9562883435582825e-06, + "objective/entropy": 163.5762939453125, + "objective/kl": 11.832403182983398, + "objective/non_score_reward": -1.1832401752471924, + "objective/rlhf_reward": -4.332961043715477, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.51448059082031, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7185041904449463, + "step": 228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005269050598145 + }, + { + "episode": 3680, + "epoch": 0.06614660099938886, + "loss/policy_avg": 0.2001960575580597, + "lr": 2.9560966257668713e-06, + "objective/entropy": -31.42165756225586, + "objective/kl": 11.342616081237793, + "objective/non_score_reward": -1.1342616081237793, + "objective/rlhf_reward": -2.5896352929639175, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 163.35409545898438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5161740779876709, + "step": 229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981510639190674 + }, + { + "episode": 3696, + "epoch": 0.06643419491677752, + "loss/policy_avg": 0.033837247639894485, + "lr": 2.95590490797546e-06, + "objective/entropy": -127.0381088256836, + "objective/kl": 12.982643127441406, + "objective/non_score_reward": -1.298264503479004, + "objective/rlhf_reward": -7.193058013916016, + "objective/scores": -0.5, + "policy/approxkl_avg": 175.97671508789062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7343233823776245, + "step": 230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994841814041138 + }, + { + "episode": 3712, + "epoch": 0.06672178883416616, + "loss/policy_avg": 0.3433837890625, + "lr": 2.9557131901840494e-06, + "objective/entropy": 68.5723876953125, + "objective/kl": 13.85904598236084, + "objective/non_score_reward": -1.3859045505523682, + "objective/rlhf_reward": -3.8102850029865896, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 221.48330688476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7557601928710938, + "step": 231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970917701721191 + }, + { + "episode": 3728, + "epoch": 0.06700938275155481, + "loss/policy_avg": 0.2773955464363098, + "lr": 2.955521472392638e-06, + "objective/entropy": -77.472412109375, + "objective/kl": 8.535304069519043, + "objective/non_score_reward": -0.8535304069519043, + "objective/rlhf_reward": -0.4904027923357215, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 31.26820182800293, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47358882427215576, + "step": 232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977991580963135 + }, + { + "episode": 3744, + "epoch": 0.06729697666894345, + "loss/policy_avg": 0.419773131608963, + "lr": 2.9553297546012274e-06, + "objective/entropy": 151.9324951171875, + "objective/kl": 10.881217956542969, + "objective/non_score_reward": -1.0881218910217285, + "objective/rlhf_reward": -2.4050763649510696, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 90.420166015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7324544787406921, + "step": 233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985604286193848 + }, + { + "episode": 3760, + "epoch": 0.0675845705863321, + "loss/policy_avg": 0.19571278989315033, + "lr": 2.955138036809816e-06, + "objective/entropy": 164.4075927734375, + "objective/kl": 9.931008338928223, + "objective/non_score_reward": -0.9931010007858276, + "objective/rlhf_reward": -1.8496975473323207, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 90.80259704589844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4490511417388916, + "step": 234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968398809432983 + }, + { + "episode": 3776, + "epoch": 0.06787216450372074, + "loss/policy_avg": -0.0029441099613904953, + "lr": 2.954946319018405e-06, + "objective/entropy": -57.89764404296875, + "objective/kl": 11.663187026977539, + "objective/non_score_reward": -1.1663187742233276, + "objective/rlhf_reward": -2.265275067090988, + "objective/scores": 0.6, + "policy/approxkl_avg": 56.57416915893555, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6279686689376831, + "step": 235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994487762451172 + }, + { + "episode": 3792, + "epoch": 0.0681597584211094, + "loss/policy_avg": 0.19873343408107758, + "lr": 2.954754601226994e-06, + "objective/entropy": 69.95574951171875, + "objective/kl": 2.667611598968506, + "objective/non_score_reward": -0.2667612135410309, + "objective/rlhf_reward": -3.067044734954834, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.46300506591797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6580133438110352, + "step": 236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998089075088501 + }, + { + "episode": 3808, + "epoch": 0.06844735233849804, + "loss/policy_avg": 0.06835653632879257, + "lr": 2.9545628834355827e-06, + "objective/entropy": -18.042882919311523, + "objective/kl": 10.576539993286133, + "objective/non_score_reward": -1.0576539039611816, + "objective/rlhf_reward": -6.230615615844727, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.800323486328125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.38686278462409973, + "step": 237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9957520961761475 + }, + { + "episode": 3824, + "epoch": 0.06873494625588669, + "loss/policy_avg": 0.09460563957691193, + "lr": 2.954371165644172e-06, + "objective/entropy": -168.65773010253906, + "objective/kl": 9.4718017578125, + "objective/non_score_reward": -0.9471801519393921, + "objective/rlhf_reward": -3.388720667362213, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.32270812988281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3954545259475708, + "step": 238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000286817550659 + }, + { + "episode": 3840, + "epoch": 0.06902254017327533, + "loss/policy_avg": -0.28287017345428467, + "lr": 2.9541794478527607e-06, + "objective/entropy": 191.8770751953125, + "objective/kl": 5.481754302978516, + "objective/non_score_reward": -0.5481754541397095, + "objective/rlhf_reward": -4.192701816558838, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.588523864746094, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4681757688522339, + "step": 239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001433849334717 + }, + { + "episode": 3856, + "epoch": 0.06931013409066399, + "loss/policy_avg": 0.4215943217277527, + "lr": 2.9539877300613495e-06, + "objective/entropy": 122.14271545410156, + "objective/kl": 11.92599868774414, + "objective/non_score_reward": -1.1926000118255615, + "objective/rlhf_reward": -4.370399898290634, + "objective/scores": 0.1, + "policy/approxkl_avg": 73.78562927246094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6994770765304565, + "step": 240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991042613983154 + }, + { + "episode": 3872, + "epoch": 0.06959772800805263, + "loss/policy_avg": 0.14229349792003632, + "lr": 2.9537960122699387e-06, + "objective/entropy": 97.91790008544922, + "objective/kl": 11.320087432861328, + "objective/non_score_reward": -1.1320087909698486, + "objective/rlhf_reward": -0.1280353426933285, + "objective/scores": 1.1, + "policy/approxkl_avg": 53.905311584472656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7801535129547119, + "step": 241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005578994750977 + }, + { + "episode": 3888, + "epoch": 0.06988532192544128, + "loss/policy_avg": 0.6044175624847412, + "lr": 2.9536042944785275e-06, + "objective/entropy": -45.497032165527344, + "objective/kl": 14.7174072265625, + "objective/non_score_reward": -1.47174072265625, + "objective/rlhf_reward": -3.9395517510938003, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 80.24563598632812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4917639493942261, + "step": 242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982421398162842 + }, + { + "episode": 3904, + "epoch": 0.07017291584282992, + "loss/policy_avg": 0.37928763031959534, + "lr": 2.9534125766871168e-06, + "objective/entropy": 85.73689270019531, + "objective/kl": 3.559020519256592, + "objective/non_score_reward": -0.3559020459651947, + "objective/rlhf_reward": 2.9763918161392215, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.9539512395858765, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5831518173217773, + "step": 243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.023195743560791 + }, + { + "episode": 3920, + "epoch": 0.07046050976021857, + "loss/policy_avg": 0.06145331636071205, + "lr": 2.9532208588957056e-06, + "objective/entropy": -83.13604736328125, + "objective/kl": 10.009160995483398, + "objective/non_score_reward": -1.0009161233901978, + "objective/rlhf_reward": -6.003664016723633, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.118873596191406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7157425880432129, + "step": 244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9978001117706299 + }, + { + "episode": 3936, + "epoch": 0.07074810367760721, + "loss/policy_avg": 0.34171396493911743, + "lr": 2.9530291411042944e-06, + "objective/entropy": -138.12054443359375, + "objective/kl": 13.933228492736816, + "objective/non_score_reward": -1.3933229446411133, + "objective/rlhf_reward": -2.649572645069334, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 283.137939453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8090240955352783, + "step": 245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99606454372406 + }, + { + "episode": 3952, + "epoch": 0.07103569759499587, + "loss/policy_avg": 0.05803845077753067, + "lr": 2.9528374233128836e-06, + "objective/entropy": -0.7659759521484375, + "objective/kl": 10.326383590698242, + "objective/non_score_reward": -1.0326383113861084, + "objective/rlhf_reward": -3.730553215742111, + "objective/scores": 0.1, + "policy/approxkl_avg": 82.2501449584961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6106427311897278, + "step": 246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012848377227783 + }, + { + "episode": 3968, + "epoch": 0.0713232915123845, + "loss/policy_avg": 0.27074679732322693, + "lr": 2.9526457055214724e-06, + "objective/entropy": -100.07415771484375, + "objective/kl": 9.287663459777832, + "objective/non_score_reward": -0.9287664294242859, + "objective/rlhf_reward": -1.592359433249507, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 4.526418209075928, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5503559112548828, + "step": 247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001773834228516 + }, + { + "episode": 3984, + "epoch": 0.07161088542977316, + "loss/policy_avg": 0.1150532141327858, + "lr": 2.9524539877300617e-06, + "objective/entropy": 37.045387268066406, + "objective/kl": 9.54155158996582, + "objective/non_score_reward": -0.9541550874710083, + "objective/rlhf_reward": 0.5833795011043552, + "objective/scores": 1.1, + "policy/approxkl_avg": 118.93629455566406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7818096876144409, + "step": 248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984393119812012 + }, + { + "episode": 4000, + "epoch": 0.07189847934716181, + "loss/policy_avg": 0.11583375930786133, + "lr": 2.9522622699386505e-06, + "objective/entropy": -29.975513458251953, + "objective/kl": 11.691999435424805, + "objective/non_score_reward": -1.169199824333191, + "objective/rlhf_reward": -4.276799207925796, + "objective/scores": 0.1, + "policy/approxkl_avg": 95.38002014160156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6541903018951416, + "step": 249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976664781570435 + }, + { + "episode": 4016, + "epoch": 0.07218607326455045, + "loss/policy_avg": 0.424482524394989, + "lr": 2.9520705521472393e-06, + "objective/entropy": 108.46614074707031, + "objective/kl": 7.699254989624023, + "objective/non_score_reward": -0.7699254751205444, + "objective/rlhf_reward": -2.679701870679855, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.84117126464844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9610189199447632, + "step": 250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005178451538086 + }, + { + "episode": 4032, + "epoch": 0.07247366718193911, + "loss/policy_avg": -0.013865754008293152, + "lr": 2.9518788343558285e-06, + "objective/entropy": 2.4157180786132812, + "objective/kl": 8.162057876586914, + "objective/non_score_reward": -0.8162057399749756, + "objective/rlhf_reward": -5.264822959899902, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.96641540527344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6327460408210754, + "step": 251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980905055999756 + }, + { + "episode": 4048, + "epoch": 0.07276126109932775, + "loss/policy_avg": 0.12326370179653168, + "lr": 2.9516871165644173e-06, + "objective/entropy": -18.136714935302734, + "objective/kl": 15.872564315795898, + "objective/non_score_reward": -1.5872564315795898, + "objective/rlhf_reward": -8.34902572631836, + "objective/scores": -0.5, + "policy/approxkl_avg": 183.87185668945312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.614548921585083, + "step": 252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986623525619507 + }, + { + "episode": 4064, + "epoch": 0.0730488550167164, + "loss/policy_avg": 0.42759251594543457, + "lr": 2.951495398773006e-06, + "objective/entropy": 187.13995361328125, + "objective/kl": 15.658686637878418, + "objective/non_score_reward": -1.5658683776855469, + "objective/rlhf_reward": -5.863473868370056, + "objective/scores": 0.1, + "policy/approxkl_avg": 77.336669921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7136948704719543, + "step": 253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998879075050354 + }, + { + "episode": 4080, + "epoch": 0.07333644893410504, + "loss/policy_avg": 0.45976442098617554, + "lr": 2.9513036809815954e-06, + "objective/entropy": -140.05638122558594, + "objective/kl": 3.995250940322876, + "objective/non_score_reward": -0.39952513575553894, + "objective/rlhf_reward": 2.8018994905054573, + "objective/scores": 1.1, + "policy/approxkl_avg": 19.618144989013672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5429213047027588, + "step": 254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998197078704834 + }, + { + "episode": 4096, + "epoch": 0.0736240428514937, + "loss/policy_avg": -0.02419188618659973, + "lr": 2.951111963190184e-06, + "objective/entropy": -150.40243530273438, + "objective/kl": 5.681851387023926, + "objective/non_score_reward": -0.5681850910186768, + "objective/rlhf_reward": -4.272740364074707, + "objective/scores": -0.5, + "policy/approxkl_avg": 25.766756057739258, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49843645095825195, + "step": 255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990272521972656 + }, + { + "episode": 4112, + "epoch": 0.07391163676888234, + "loss/policy_avg": 0.09387945383787155, + "lr": 2.950920245398773e-06, + "objective/entropy": 21.77161407470703, + "objective/kl": 8.116277694702148, + "objective/non_score_reward": -0.8116278648376465, + "objective/rlhf_reward": -5.246511459350586, + "objective/scores": -0.5, + "policy/approxkl_avg": 51.24212646484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.709270715713501, + "step": 256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979963302612305 + }, + { + "episode": 4128, + "epoch": 0.07419923068627099, + "loss/policy_avg": -0.4069734215736389, + "lr": 2.950728527607362e-06, + "objective/entropy": -174.08395385742188, + "objective/kl": 8.566609382629395, + "objective/non_score_reward": -0.8566610217094421, + "objective/rlhf_reward": -1.7647844008809193, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 61.56124496459961, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4532226324081421, + "step": 257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002131938934326 + }, + { + "episode": 4144, + "epoch": 0.07448682460365963, + "loss/policy_avg": 0.04940726235508919, + "lr": 2.950536809815951e-06, + "objective/entropy": 32.93096923828125, + "objective/kl": 13.826756477355957, + "objective/non_score_reward": -1.3826756477355957, + "objective/rlhf_reward": -7.530702590942383, + "objective/scores": -0.5, + "policy/approxkl_avg": 152.21774291992188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4292905330657959, + "step": 258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970979690551758 + }, + { + "episode": 4160, + "epoch": 0.07477441852104828, + "loss/policy_avg": 0.08529473841190338, + "lr": 2.95034509202454e-06, + "objective/entropy": 14.531261444091797, + "objective/kl": 6.426525115966797, + "objective/non_score_reward": -0.6426525712013245, + "objective/rlhf_reward": 1.8293897002935413, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.471156597137451, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3390624523162842, + "step": 259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988377094268799 + }, + { + "episode": 4176, + "epoch": 0.07506201243843692, + "loss/policy_avg": 0.6565670371055603, + "lr": 2.9501533742331287e-06, + "objective/entropy": 13.341388702392578, + "objective/kl": 13.047385215759277, + "objective/non_score_reward": -1.3047385215759277, + "objective/rlhf_reward": -3.0962477944054942, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 156.30238342285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6474744081497192, + "step": 260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990012645721436 + }, + { + "episode": 4192, + "epoch": 0.07534960635582558, + "loss/policy_avg": 0.8194575309753418, + "lr": 2.949961656441718e-06, + "objective/entropy": 39.95905303955078, + "objective/kl": 11.301864624023438, + "objective/non_score_reward": -1.1301864385604858, + "objective/rlhf_reward": -6.520745754241943, + "objective/scores": -0.5, + "policy/approxkl_avg": 100.30358123779297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6240185499191284, + "step": 261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996020793914795 + }, + { + "episode": 4208, + "epoch": 0.07563720027321422, + "loss/policy_avg": 0.9927411079406738, + "lr": 2.9497699386503067e-06, + "objective/entropy": 214.742431640625, + "objective/kl": 10.751565933227539, + "objective/non_score_reward": -1.075156569480896, + "objective/rlhf_reward": -3.900626084208488, + "objective/scores": 0.1, + "policy/approxkl_avg": 190.6407928466797, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5556049346923828, + "step": 262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008435249328613 + }, + { + "episode": 4224, + "epoch": 0.07592479419060287, + "loss/policy_avg": 0.1370810866355896, + "lr": 2.949578220858896e-06, + "objective/entropy": -30.638282775878906, + "objective/kl": 4.496548652648926, + "objective/non_score_reward": -0.4496549069881439, + "objective/rlhf_reward": 0.14879166059023552, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.034202575683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6356778740882874, + "step": 263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985594749450684 + }, + { + "episode": 4240, + "epoch": 0.07621238810799151, + "loss/policy_avg": 0.6080716848373413, + "lr": 2.9493865030674847e-06, + "objective/entropy": -11.881404876708984, + "objective/kl": 10.906417846679688, + "objective/non_score_reward": -1.0906418561935425, + "objective/rlhf_reward": -2.5377384528246627, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 79.76661682128906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5296034812927246, + "step": 264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986059665679932 + }, + { + "episode": 4256, + "epoch": 0.07649998202538016, + "loss/policy_avg": 0.21646641194820404, + "lr": 2.9491947852760736e-06, + "objective/entropy": 142.3988800048828, + "objective/kl": 6.696286201477051, + "objective/non_score_reward": -0.6696287393569946, + "objective/rlhf_reward": 1.7214852511882786, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.300674438476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6062541007995605, + "step": 265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001516342163086 + }, + { + "episode": 4272, + "epoch": 0.0767875759427688, + "loss/policy_avg": 0.13891346752643585, + "lr": 2.9490030674846628e-06, + "objective/entropy": -61.452545166015625, + "objective/kl": 5.754723072052002, + "objective/non_score_reward": -0.575472354888916, + "objective/rlhf_reward": 0.09811072945594779, + "objective/scores": 0.6, + "policy/approxkl_avg": 18.09847068786621, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.41696321964263916, + "step": 266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999955177307129 + }, + { + "episode": 4288, + "epoch": 0.07707516986015746, + "loss/policy_avg": 0.19311824440956116, + "lr": 2.9488113496932516e-06, + "objective/entropy": 32.384788513183594, + "objective/kl": 10.006759643554688, + "objective/non_score_reward": -1.000675916671753, + "objective/rlhf_reward": -3.6027039051055905, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.9388427734375, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.571370542049408, + "step": 267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998531460762024 + }, + { + "episode": 4304, + "epoch": 0.0773627637775461, + "loss/policy_avg": -0.14073559641838074, + "lr": 2.9486196319018404e-06, + "objective/entropy": 85.10265350341797, + "objective/kl": 15.816366195678711, + "objective/non_score_reward": -1.5816365480422974, + "objective/rlhf_reward": -5.9265462219715115, + "objective/scores": 0.1, + "policy/approxkl_avg": 122.70755004882812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.756123423576355, + "step": 268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997191429138184 + }, + { + "episode": 4320, + "epoch": 0.07765035769493475, + "loss/policy_avg": 0.17501243948936462, + "lr": 2.9484279141104296e-06, + "objective/entropy": 93.92215728759766, + "objective/kl": 9.422307968139648, + "objective/non_score_reward": -0.9422306418418884, + "objective/rlhf_reward": -3.368922537565231, + "objective/scores": 0.1, + "policy/approxkl_avg": 55.133872985839844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5802706480026245, + "step": 269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981038570404053 + }, + { + "episode": 4336, + "epoch": 0.0779379516123234, + "loss/policy_avg": 0.4402735233306885, + "lr": 2.9482361963190184e-06, + "objective/entropy": -10.538581848144531, + "objective/kl": 10.759380340576172, + "objective/non_score_reward": -1.075938105583191, + "objective/rlhf_reward": -2.6418928853875263, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 119.47884368896484, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5788609385490417, + "step": 270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987006187438965 + }, + { + "episode": 4352, + "epoch": 0.07822554552971205, + "loss/policy_avg": 0.582582414150238, + "lr": 2.9480444785276077e-06, + "objective/entropy": -98.20463562011719, + "objective/kl": 9.743408203125, + "objective/non_score_reward": -0.9743408560752869, + "objective/rlhf_reward": 0.5026364937424663, + "objective/scores": 1.1, + "policy/approxkl_avg": 72.30321502685547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6191748380661011, + "step": 271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998560905456543 + }, + { + "episode": 4368, + "epoch": 0.0785131394471007, + "loss/policy_avg": 0.6289035081863403, + "lr": 2.9478527607361965e-06, + "objective/entropy": -71.34103393554688, + "objective/kl": 12.412795066833496, + "objective/non_score_reward": -1.2412794828414917, + "objective/rlhf_reward": -4.565117752552032, + "objective/scores": 0.1, + "policy/approxkl_avg": 164.9723358154297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736472487449646, + "step": 272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970333576202393 + }, + { + "episode": 4384, + "epoch": 0.07880073336448934, + "loss/policy_avg": 0.21773582696914673, + "lr": 2.9476610429447853e-06, + "objective/entropy": -0.68145751953125, + "objective/kl": 13.457925796508789, + "objective/non_score_reward": -1.3457924127578735, + "objective/rlhf_reward": -4.98316973298788, + "objective/scores": 0.1, + "policy/approxkl_avg": 254.5444793701172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8520537614822388, + "step": 273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983816146850586 + }, + { + "episode": 4400, + "epoch": 0.079088327281878, + "loss/policy_avg": 0.7082804441452026, + "lr": 2.9474693251533745e-06, + "objective/entropy": -72.47603607177734, + "objective/kl": 11.576058387756348, + "objective/non_score_reward": -1.1576058864593506, + "objective/rlhf_reward": -6.630423545837402, + "objective/scores": -0.5, + "policy/approxkl_avg": 119.74085235595703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6347866058349609, + "step": 274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995064735412598 + }, + { + "episode": 4416, + "epoch": 0.07937592119926663, + "loss/policy_avg": -0.11256889998912811, + "lr": 2.9472776073619633e-06, + "objective/entropy": -83.20799255371094, + "objective/kl": 3.853982925415039, + "objective/non_score_reward": -0.385398268699646, + "objective/rlhf_reward": 2.8584069401025776, + "objective/scores": 1.1, + "policy/approxkl_avg": 0.9227147102355957, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.39738836884498596, + "step": 275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0011487007141113 + }, + { + "episode": 4432, + "epoch": 0.07966351511665529, + "loss/policy_avg": 0.32202059030532837, + "lr": 2.947085889570552e-06, + "objective/entropy": 71.21481323242188, + "objective/kl": 10.407328605651855, + "objective/non_score_reward": -1.0407328605651855, + "objective/rlhf_reward": -6.162931442260742, + "objective/scores": -0.5, + "policy/approxkl_avg": 108.96805572509766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5844370126724243, + "step": 276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993963241577148 + }, + { + "episode": 4448, + "epoch": 0.07995110903404393, + "loss/policy_avg": -0.023769661784172058, + "lr": 2.9468941717791414e-06, + "objective/entropy": -13.402759552001953, + "objective/kl": 12.098983764648438, + "objective/non_score_reward": -1.2098984718322754, + "objective/rlhf_reward": -6.839593887329102, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.1106595993042, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7059996724128723, + "step": 277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998667240142822 + }, + { + "episode": 4464, + "epoch": 0.08023870295143258, + "loss/policy_avg": 0.3893451690673828, + "lr": 2.9467024539877298e-06, + "objective/entropy": 189.41259765625, + "objective/kl": 6.5235137939453125, + "objective/non_score_reward": -0.6523513793945312, + "objective/rlhf_reward": -2.20940545797348, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.074134826660156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.798334002494812, + "step": 278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013234615325928 + }, + { + "episode": 4480, + "epoch": 0.08052629686882122, + "loss/policy_avg": 0.0914541631937027, + "lr": 2.946510736196319e-06, + "objective/entropy": 84.65312194824219, + "objective/kl": 10.466255187988281, + "objective/non_score_reward": -1.0466254949569702, + "objective/rlhf_reward": 0.21349813938140905, + "objective/scores": 1.1, + "policy/approxkl_avg": 125.91380310058594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7349311113357544, + "step": 279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984486103057861 + }, + { + "episode": 4496, + "epoch": 0.08081389078620987, + "loss/policy_avg": 0.26043936610221863, + "lr": 2.946319018404908e-06, + "objective/entropy": -58.848392486572266, + "objective/kl": 10.368853569030762, + "objective/non_score_reward": -1.0368852615356445, + "objective/rlhf_reward": -1.7475412249565123, + "objective/scores": 0.6, + "policy/approxkl_avg": 51.58127975463867, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.612343430519104, + "step": 280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989829063415527 + }, + { + "episode": 4512, + "epoch": 0.08110148470359851, + "loss/policy_avg": 0.2319001704454422, + "lr": 2.946127300613497e-06, + "objective/entropy": -36.56258773803711, + "objective/kl": 9.824468612670898, + "objective/non_score_reward": -0.9824467897415161, + "objective/rlhf_reward": -5.9297871589660645, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.27013397216797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7777178883552551, + "step": 281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996426105499268 + }, + { + "episode": 4528, + "epoch": 0.08138907862098717, + "loss/policy_avg": 0.2995202839374542, + "lr": 2.945935582822086e-06, + "objective/entropy": 48.848323822021484, + "objective/kl": 16.30365753173828, + "objective/non_score_reward": -1.6303660869598389, + "objective/rlhf_reward": -6.1214640274643894, + "objective/scores": 0.1, + "policy/approxkl_avg": 165.35614013671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6986551284790039, + "step": 282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979193210601807 + }, + { + "episode": 4544, + "epoch": 0.08167667253837581, + "loss/policy_avg": 0.47233566641807556, + "lr": 2.9457438650306747e-06, + "objective/entropy": 59.0998420715332, + "objective/kl": 12.852258682250977, + "objective/non_score_reward": -1.2852261066436768, + "objective/rlhf_reward": -7.140904426574707, + "objective/scores": -0.5, + "policy/approxkl_avg": 134.68528747558594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.42645326256752014, + "step": 283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972941875457764 + }, + { + "episode": 4560, + "epoch": 0.08196426645576446, + "loss/policy_avg": 0.36607012152671814, + "lr": 2.945552147239264e-06, + "objective/entropy": 53.86030578613281, + "objective/kl": 5.768060684204102, + "objective/non_score_reward": -0.5768060684204102, + "objective/rlhf_reward": 2.0927755475044254, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.973777770996094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7312701940536499, + "step": 284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9965434074401855 + }, + { + "episode": 4576, + "epoch": 0.0822518603731531, + "loss/policy_avg": -0.048812031745910645, + "lr": 2.9453604294478527e-06, + "objective/entropy": 65.61720275878906, + "objective/kl": 8.64478588104248, + "objective/non_score_reward": -0.864478588104248, + "objective/rlhf_reward": 0.9420856922864917, + "objective/scores": 1.1, + "policy/approxkl_avg": 79.18849182128906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6703629493713379, + "step": 285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975764751434326 + }, + { + "episode": 4592, + "epoch": 0.08253945429054176, + "loss/policy_avg": 0.12739452719688416, + "lr": 2.945168711656442e-06, + "objective/entropy": -44.89176940917969, + "objective/kl": 4.6148176193237305, + "objective/non_score_reward": -0.4614817500114441, + "objective/rlhf_reward": 2.554072973877192, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.531970024108887, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6312613487243652, + "step": 286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990942478179932 + }, + { + "episode": 4608, + "epoch": 0.0828270482079304, + "loss/policy_avg": 0.13534197211265564, + "lr": 2.9449769938650308e-06, + "objective/entropy": 234.43975830078125, + "objective/kl": 9.361823081970215, + "objective/non_score_reward": -0.9361822605133057, + "objective/rlhf_reward": -3.3447290569543835, + "objective/scores": 0.1, + "policy/approxkl_avg": 136.4779815673828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5726273059844971, + "step": 287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99815034866333 + }, + { + "episode": 4624, + "epoch": 0.08311464212531905, + "loss/policy_avg": 0.42366448044776917, + "lr": 2.9447852760736196e-06, + "objective/entropy": -113.96307373046875, + "objective/kl": 8.458015441894531, + "objective/non_score_reward": -0.845801591873169, + "objective/rlhf_reward": -5.383206367492676, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.63624572753906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7157449126243591, + "step": 288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999110221862793 + }, + { + "episode": 4640, + "epoch": 0.0834022360427077, + "loss/policy_avg": 0.5396846532821655, + "lr": 2.944593558282209e-06, + "objective/entropy": 132.98178100585938, + "objective/kl": 13.068355560302734, + "objective/non_score_reward": -1.3068355321884155, + "objective/rlhf_reward": -2.8273421287536618, + "objective/scores": 0.6, + "policy/approxkl_avg": 203.39913940429688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8317103385925293, + "step": 289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000296115875244 + }, + { + "episode": 4656, + "epoch": 0.08368982996009634, + "loss/policy_avg": 0.19873766601085663, + "lr": 2.9444018404907976e-06, + "objective/entropy": -150.62936401367188, + "objective/kl": 12.215543746948242, + "objective/non_score_reward": -1.2215545177459717, + "objective/rlhf_reward": -3.282097909514027, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 94.99072265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8213875889778137, + "step": 290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985655546188354 + }, + { + "episode": 4672, + "epoch": 0.083977423877485, + "loss/policy_avg": 0.02723608911037445, + "lr": 2.9442101226993864e-06, + "objective/entropy": 122.41561889648438, + "objective/kl": 8.34988784790039, + "objective/non_score_reward": -0.8349887132644653, + "objective/rlhf_reward": -5.3399553298950195, + "objective/scores": -0.5, + "policy/approxkl_avg": 45.76279067993164, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.34027427434921265, + "step": 291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0011491775512695 + }, + { + "episode": 4688, + "epoch": 0.08426501779487364, + "loss/policy_avg": 0.32025736570358276, + "lr": 2.9440184049079756e-06, + "objective/entropy": -45.17048645019531, + "objective/kl": 4.012126445770264, + "objective/non_score_reward": -0.4012127220630646, + "objective/rlhf_reward": 2.795149059593678, + "objective/scores": 1.1, + "policy/approxkl_avg": 35.060951232910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4729318916797638, + "step": 292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977355003356934 + }, + { + "episode": 4704, + "epoch": 0.08455261171226229, + "loss/policy_avg": 0.22684016823768616, + "lr": 2.9438266871165645e-06, + "objective/entropy": 16.97724151611328, + "objective/kl": 14.390579223632812, + "objective/non_score_reward": -1.4390579462051392, + "objective/rlhf_reward": -3.808820496277745, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 180.50299072265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6010082960128784, + "step": 293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9963459968566895 + }, + { + "episode": 4720, + "epoch": 0.08484020562965093, + "loss/policy_avg": 0.44294729828834534, + "lr": 2.9436349693251537e-06, + "objective/entropy": -67.02993774414062, + "objective/kl": 11.463648796081543, + "objective/non_score_reward": -1.1463651657104492, + "objective/rlhf_reward": -0.1854602456092831, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.088623046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5088694095611572, + "step": 294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967097043991089 + }, + { + "episode": 4736, + "epoch": 0.08512779954703958, + "loss/policy_avg": 0.12860551476478577, + "lr": 2.9434432515337425e-06, + "objective/entropy": 73.75012969970703, + "objective/kl": 11.233770370483398, + "objective/non_score_reward": -1.1233770847320557, + "objective/rlhf_reward": -4.093508290499448, + "objective/scores": 0.1, + "policy/approxkl_avg": 110.15925598144531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49316030740737915, + "step": 295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9951969385147095 + }, + { + "episode": 4752, + "epoch": 0.08541539346442822, + "loss/policy_avg": 0.4201592803001404, + "lr": 2.9432515337423313e-06, + "objective/entropy": 43.006744384765625, + "objective/kl": 12.237357139587402, + "objective/non_score_reward": -1.2237358093261719, + "objective/rlhf_reward": -6.8949432373046875, + "objective/scores": -0.5, + "policy/approxkl_avg": 152.20016479492188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7024413347244263, + "step": 296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9959149360656738 + }, + { + "episode": 4768, + "epoch": 0.08570298738181688, + "loss/policy_avg": 0.17788049578666687, + "lr": 2.9430598159509205e-06, + "objective/entropy": -236.15725708007812, + "objective/kl": 9.120914459228516, + "objective/non_score_reward": -0.9120914936065674, + "objective/rlhf_reward": 0.751633965969086, + "objective/scores": 1.1, + "policy/approxkl_avg": 85.18730163574219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6455779671669006, + "step": 297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998746395111084 + }, + { + "episode": 4784, + "epoch": 0.08599058129920552, + "loss/policy_avg": -0.09744630753993988, + "lr": 2.9428680981595093e-06, + "objective/entropy": 28.533233642578125, + "objective/kl": 6.665700912475586, + "objective/non_score_reward": -0.6665701270103455, + "objective/rlhf_reward": -4.666280269622803, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.560096740722656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8575639724731445, + "step": 298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005359649658203 + }, + { + "episode": 4800, + "epoch": 0.08627817521659417, + "loss/policy_avg": 0.11877211183309555, + "lr": 2.9426763803680986e-06, + "objective/entropy": 163.72457885742188, + "objective/kl": 7.44589900970459, + "objective/non_score_reward": -0.7445899248123169, + "objective/rlhf_reward": 1.4216401070356373, + "objective/scores": 1.1, + "policy/approxkl_avg": 10.336346626281738, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6966899633407593, + "step": 299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998868703842163 + }, + { + "episode": 4816, + "epoch": 0.08656576913398281, + "loss/policy_avg": 0.3418015241622925, + "lr": 2.9424846625766874e-06, + "objective/entropy": -112.08236694335938, + "objective/kl": 12.987334251403809, + "objective/non_score_reward": -1.2987333536148071, + "objective/rlhf_reward": -0.7949335634708401, + "objective/scores": 1.1, + "policy/approxkl_avg": 233.1175994873047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6791957020759583, + "step": 300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992109537124634 + }, + { + "episode": 4832, + "epoch": 0.08685336305137147, + "loss/policy_avg": 0.17691361904144287, + "lr": 2.942292944785276e-06, + "objective/entropy": 261.80804443359375, + "objective/kl": 16.804275512695312, + "objective/non_score_reward": -1.6804277896881104, + "objective/rlhf_reward": -6.321710979938507, + "objective/scores": 0.1, + "policy/approxkl_avg": 268.57550048828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6984161734580994, + "step": 301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9969482421875 + }, + { + "episode": 4848, + "epoch": 0.0871409569687601, + "loss/policy_avg": 0.16571223735809326, + "lr": 2.942101226993865e-06, + "objective/entropy": 186.30453491210938, + "objective/kl": 7.996967315673828, + "objective/non_score_reward": -0.799696683883667, + "objective/rlhf_reward": -5.198786735534668, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.46959686279297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5912961959838867, + "step": 302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007529258728027 + }, + { + "episode": 4864, + "epoch": 0.08742855088614876, + "loss/policy_avg": 0.19111140072345734, + "lr": 2.941909509202454e-06, + "objective/entropy": -6.660182952880859, + "objective/kl": 9.802804946899414, + "objective/non_score_reward": -0.9802805781364441, + "objective/rlhf_reward": -5.9211225509643555, + "objective/scores": -0.5, + "policy/approxkl_avg": 100.47252655029297, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6817684173583984, + "step": 303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000228881835938 + }, + { + "episode": 4880, + "epoch": 0.0877161448035374, + "loss/policy_avg": 0.050571128726005554, + "lr": 2.941717791411043e-06, + "objective/entropy": 106.45433807373047, + "objective/kl": 15.282678604125977, + "objective/non_score_reward": -1.528267741203308, + "objective/rlhf_reward": -8.11307144165039, + "objective/scores": -0.5, + "policy/approxkl_avg": 272.7275085449219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5246777534484863, + "step": 304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000553131103516 + }, + { + "episode": 4896, + "epoch": 0.08800373872092605, + "loss/policy_avg": 0.5568979978561401, + "lr": 2.941526073619632e-06, + "objective/entropy": 16.014564514160156, + "objective/kl": 9.838717460632324, + "objective/non_score_reward": -0.9838719367980957, + "objective/rlhf_reward": -5.935487747192383, + "objective/scores": -0.5, + "policy/approxkl_avg": 102.69760131835938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5866303443908691, + "step": 305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998826026916504 + }, + { + "episode": 4912, + "epoch": 0.08829133263831469, + "loss/policy_avg": 0.19343560934066772, + "lr": 2.9413343558282207e-06, + "objective/entropy": -7.7786407470703125, + "objective/kl": 13.301519393920898, + "objective/non_score_reward": -1.3301520347595215, + "objective/rlhf_reward": -7.320608139038086, + "objective/scores": -0.5, + "policy/approxkl_avg": 99.96537780761719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6204877495765686, + "step": 306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997145414352417 + }, + { + "episode": 4928, + "epoch": 0.08857892655570335, + "loss/policy_avg": 0.4353540539741516, + "lr": 2.94114263803681e-06, + "objective/entropy": 180.70339965820312, + "objective/kl": 15.014328002929688, + "objective/non_score_reward": -1.5014326572418213, + "objective/rlhf_reward": -4.343870957911598, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 205.107666015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5958458185195923, + "step": 307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999456763267517 + }, + { + "episode": 4944, + "epoch": 0.088866520473092, + "loss/policy_avg": 0.044183149933815, + "lr": 2.9409509202453987e-06, + "objective/entropy": 147.0219268798828, + "objective/kl": 7.249485969543457, + "objective/non_score_reward": -0.7249486446380615, + "objective/rlhf_reward": -2.49979438483715, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.019502639770508, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8432949185371399, + "step": 308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000023126602173 + }, + { + "episode": 4960, + "epoch": 0.08915411439048064, + "loss/policy_avg": -0.1347326934337616, + "lr": 2.940759202453988e-06, + "objective/entropy": -102.82943725585938, + "objective/kl": 8.855981826782227, + "objective/non_score_reward": -0.8855981826782227, + "objective/rlhf_reward": -5.542392730712891, + "objective/scores": -0.5, + "policy/approxkl_avg": 78.90658569335938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6382081508636475, + "step": 309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998550415039062 + }, + { + "episode": 4976, + "epoch": 0.0894417083078693, + "loss/policy_avg": 0.2769806981086731, + "lr": 2.9405674846625768e-06, + "objective/entropy": 222.6593017578125, + "objective/kl": 12.968841552734375, + "objective/non_score_reward": -1.2968841791152954, + "objective/rlhf_reward": -0.7875365525484082, + "objective/scores": 1.1, + "policy/approxkl_avg": 143.65463256835938, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.644939661026001, + "step": 310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0016446113586426 + }, + { + "episode": 4992, + "epoch": 0.08972930222525793, + "loss/policy_avg": -0.31937313079833984, + "lr": 2.9403757668711656e-06, + "objective/entropy": 224.8351593017578, + "objective/kl": 11.696734428405762, + "objective/non_score_reward": -1.1696734428405762, + "objective/rlhf_reward": -2.5559872857489925, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 78.87408447265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.729614794254303, + "step": 311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000832080841064 + }, + { + "episode": 5008, + "epoch": 0.09001689614264659, + "loss/policy_avg": 0.6097627878189087, + "lr": 2.940184049079755e-06, + "objective/entropy": 106.34514617919922, + "objective/kl": 8.760353088378906, + "objective/non_score_reward": -0.8760353326797485, + "objective/rlhf_reward": -5.504140853881836, + "objective/scores": -0.5, + "policy/approxkl_avg": 121.40504455566406, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6102486848831177, + "step": 312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006418228149414 + }, + { + "episode": 5024, + "epoch": 0.09030449006003523, + "loss/policy_avg": 0.2938482463359833, + "lr": 2.9399923312883436e-06, + "objective/entropy": -114.29545593261719, + "objective/kl": 8.454465866088867, + "objective/non_score_reward": -0.8454465866088867, + "objective/rlhf_reward": -2.9817864209413525, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.16905212402344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6143299341201782, + "step": 313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9990007877349854 + }, + { + "episode": 5040, + "epoch": 0.09059208397742388, + "loss/policy_avg": 0.1566888689994812, + "lr": 2.939800613496933e-06, + "objective/entropy": 114.07573699951172, + "objective/kl": 5.000811576843262, + "objective/non_score_reward": -0.5000811219215393, + "objective/rlhf_reward": -4.000324726104736, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.11898422241211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5816758871078491, + "step": 314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986605644226074 + }, + { + "episode": 5056, + "epoch": 0.09087967789481252, + "loss/policy_avg": 0.08153313398361206, + "lr": 2.9396088957055217e-06, + "objective/entropy": -2.472991943359375, + "objective/kl": 15.302618026733398, + "objective/non_score_reward": -1.5302616357803345, + "objective/rlhf_reward": -4.5169265604654125, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 221.7631072998047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7500788569450378, + "step": 315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996685266494751 + }, + { + "episode": 5072, + "epoch": 0.09116727181220118, + "loss/policy_avg": -0.22540059685707092, + "lr": 2.9394171779141105e-06, + "objective/entropy": 194.83139038085938, + "objective/kl": 6.124555587768555, + "objective/non_score_reward": -0.6124556064605713, + "objective/rlhf_reward": -4.449822425842285, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.98652648925781, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4853079319000244, + "step": 316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0110392570495605 + }, + { + "episode": 5088, + "epoch": 0.09145486572958982, + "loss/policy_avg": 0.33575987815856934, + "lr": 2.9392254601226997e-06, + "objective/entropy": -246.2069854736328, + "objective/kl": 6.744620323181152, + "objective/non_score_reward": -0.674461841583252, + "objective/rlhf_reward": -0.7504362864064533, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 44.58488464355469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5204676389694214, + "step": 317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0011723041534424 + }, + { + "episode": 5104, + "epoch": 0.09174245964697847, + "loss/policy_avg": -0.03650900349020958, + "lr": 2.9390337423312885e-06, + "objective/entropy": -38.37519454956055, + "objective/kl": 12.865215301513672, + "objective/non_score_reward": -1.2865217924118042, + "objective/rlhf_reward": -7.146087169647217, + "objective/scores": -0.5, + "policy/approxkl_avg": 179.89398193359375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7440187931060791, + "step": 318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0004358291625977 + }, + { + "episode": 5120, + "epoch": 0.09203005356436711, + "loss/policy_avg": 0.39611124992370605, + "lr": 2.9388420245398773e-06, + "objective/entropy": -115.19349670410156, + "objective/kl": 7.894246578216553, + "objective/non_score_reward": -0.7894245982170105, + "objective/rlhf_reward": -5.157698631286621, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.43158721923828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6976910829544067, + "step": 319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998779296875 + }, + { + "episode": 5136, + "epoch": 0.09231764748175576, + "loss/policy_avg": 0.6114602088928223, + "lr": 2.9386503067484665e-06, + "objective/entropy": 105.1552734375, + "objective/kl": 13.109886169433594, + "objective/non_score_reward": -1.3109886646270752, + "objective/rlhf_reward": -0.8439547479152676, + "objective/scores": 1.1, + "policy/approxkl_avg": 112.3132095336914, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5804147720336914, + "step": 320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989616870880127 + }, + { + "episode": 5152, + "epoch": 0.0926052413991444, + "loss/policy_avg": 0.0743880569934845, + "lr": 2.9384585889570554e-06, + "objective/entropy": 77.44183349609375, + "objective/kl": 8.181852340698242, + "objective/non_score_reward": -0.8181852698326111, + "objective/rlhf_reward": -1.1500348619380332, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 82.51998901367188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5503402948379517, + "step": 321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0014610290527344 + }, + { + "episode": 5168, + "epoch": 0.09289283531653306, + "loss/policy_avg": 0.2947062849998474, + "lr": 2.9382668711656446e-06, + "objective/entropy": 6.5170745849609375, + "objective/kl": 14.03689956665039, + "objective/non_score_reward": -1.4036900997161865, + "objective/rlhf_reward": -1.214760041236877, + "objective/scores": 1.1, + "policy/approxkl_avg": 161.59352111816406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5341845750808716, + "step": 322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988269805908203 + }, + { + "episode": 5184, + "epoch": 0.0931804292339217, + "loss/policy_avg": 0.2814280092716217, + "lr": 2.938075153374233e-06, + "objective/entropy": -135.19436645507812, + "objective/kl": 8.1387357711792, + "objective/non_score_reward": -0.8138736486434937, + "objective/rlhf_reward": -2.8554944455623623, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.818918228149414, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.581717312335968, + "step": 323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997791051864624 + }, + { + "episode": 5200, + "epoch": 0.09346802315131035, + "loss/policy_avg": 0.36961764097213745, + "lr": 2.937883435582822e-06, + "objective/entropy": -77.63428497314453, + "objective/kl": 9.158490180969238, + "objective/non_score_reward": -0.915848970413208, + "objective/rlhf_reward": -5.663395881652832, + "objective/scores": -0.5, + "policy/approxkl_avg": 134.7732696533203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5512831807136536, + "step": 324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995112419128418 + }, + { + "episode": 5216, + "epoch": 0.09375561706869899, + "loss/policy_avg": 0.11706581711769104, + "lr": 2.937691717791411e-06, + "objective/entropy": -31.434471130371094, + "objective/kl": 15.156240463256836, + "objective/non_score_reward": -1.5156242847442627, + "objective/rlhf_reward": -8.06249713897705, + "objective/scores": -0.5, + "policy/approxkl_avg": 217.86083984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.458157479763031, + "step": 325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997313380241394 + }, + { + "episode": 5232, + "epoch": 0.09404321098608764, + "loss/policy_avg": 0.08816379308700562, + "lr": 2.9375e-06, + "objective/entropy": 182.1945343017578, + "objective/kl": 14.343957901000977, + "objective/non_score_reward": -1.4343959093093872, + "objective/rlhf_reward": -2.813864682556364, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 90.6133804321289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.763087272644043, + "step": 326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991672039031982 + }, + { + "episode": 5248, + "epoch": 0.0943308049034763, + "loss/policy_avg": 0.4887702167034149, + "lr": 2.937308282208589e-06, + "objective/entropy": 276.0743103027344, + "objective/kl": 19.531585693359375, + "objective/non_score_reward": -1.9531588554382324, + "objective/rlhf_reward": -7.412635026872159, + "objective/scores": 0.1, + "policy/approxkl_avg": 292.0692138671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6329343318939209, + "step": 327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973821640014648 + }, + { + "episode": 5264, + "epoch": 0.09461839882086494, + "loss/policy_avg": 0.19927456974983215, + "lr": 2.937116564417178e-06, + "objective/entropy": -68.87179565429688, + "objective/kl": 9.256897926330566, + "objective/non_score_reward": -0.9256898164749146, + "objective/rlhf_reward": -5.702759265899658, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.61810302734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.825851321220398, + "step": 328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997621774673462 + }, + { + "episode": 5280, + "epoch": 0.09490599273825359, + "loss/policy_avg": 0.05179551616311073, + "lr": 2.9369248466257667e-06, + "objective/entropy": 154.8633575439453, + "objective/kl": 11.498334884643555, + "objective/non_score_reward": -1.1498336791992188, + "objective/rlhf_reward": -1.6756155534994333, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 41.51012420654297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6221466064453125, + "step": 329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999433994293213 + }, + { + "episode": 5296, + "epoch": 0.09519358665564223, + "loss/policy_avg": 0.7231928110122681, + "lr": 2.936733128834356e-06, + "objective/entropy": 186.2045135498047, + "objective/kl": 10.730362892150879, + "objective/non_score_reward": -1.0730363130569458, + "objective/rlhf_reward": -3.8921451330184933, + "objective/scores": 0.1, + "policy/approxkl_avg": 121.8580322265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7908709645271301, + "step": 330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002248287200928 + }, + { + "episode": 5312, + "epoch": 0.09548118057303089, + "loss/policy_avg": 0.2208048403263092, + "lr": 2.9365414110429447e-06, + "objective/entropy": -59.700164794921875, + "objective/kl": 16.838346481323242, + "objective/non_score_reward": -1.6838349103927612, + "objective/rlhf_reward": -8.735339164733887, + "objective/scores": -0.5, + "policy/approxkl_avg": 154.73214721679688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7444063425064087, + "step": 331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9977869987487793 + }, + { + "episode": 5328, + "epoch": 0.09576877449041953, + "loss/policy_avg": 0.06984035670757294, + "lr": 2.936349693251534e-06, + "objective/entropy": 54.201515197753906, + "objective/kl": 8.278887748718262, + "objective/non_score_reward": -0.8278888463973999, + "objective/rlhf_reward": -5.311555862426758, + "objective/scores": -0.5, + "policy/approxkl_avg": 91.28977966308594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7553679943084717, + "step": 332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997904300689697 + }, + { + "episode": 5344, + "epoch": 0.09605636840780818, + "loss/policy_avg": 0.3257616460323334, + "lr": 2.9361579754601228e-06, + "objective/entropy": -28.56784439086914, + "objective/kl": 12.646832466125488, + "objective/non_score_reward": -1.2646832466125488, + "objective/rlhf_reward": -7.058732986450195, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.71322631835938, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5907671451568604, + "step": 333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982057809829712 + }, + { + "episode": 5360, + "epoch": 0.09634396232519682, + "loss/policy_avg": 0.4729722738265991, + "lr": 2.9359662576687116e-06, + "objective/entropy": -150.07943725585938, + "objective/kl": 11.293041229248047, + "objective/non_score_reward": -1.1293039321899414, + "objective/rlhf_reward": -4.1172159075737, + "objective/scores": 0.1, + "policy/approxkl_avg": 144.25387573242188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6657143831253052, + "step": 334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998594045639038 + }, + { + "episode": 5376, + "epoch": 0.09663155624258547, + "loss/policy_avg": 0.2886536419391632, + "lr": 2.935774539877301e-06, + "objective/entropy": -131.1785125732422, + "objective/kl": 10.785483360290527, + "objective/non_score_reward": -1.0785483121871948, + "objective/rlhf_reward": -3.9141932338476177, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.560306549072266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8351963758468628, + "step": 335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998948335647583 + }, + { + "episode": 5392, + "epoch": 0.09691915015997411, + "loss/policy_avg": 1.0824708938598633, + "lr": 2.9355828220858896e-06, + "objective/entropy": -5.367637634277344, + "objective/kl": 14.300538063049316, + "objective/non_score_reward": -1.430053949356079, + "objective/rlhf_reward": -2.7964971407663555, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 151.68798828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5800391435623169, + "step": 336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980026483535767 + }, + { + "episode": 5408, + "epoch": 0.09720674407736277, + "loss/policy_avg": 0.3940733075141907, + "lr": 2.935391104294479e-06, + "objective/entropy": -165.78109741210938, + "objective/kl": 15.360054016113281, + "objective/non_score_reward": -1.5360053777694702, + "objective/rlhf_reward": -1.7440215110778805, + "objective/scores": 1.1, + "policy/approxkl_avg": 114.84159088134766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6686538457870483, + "step": 337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000908374786377 + }, + { + "episode": 5424, + "epoch": 0.0974943379947514, + "loss/policy_avg": 0.07578772306442261, + "lr": 2.9351993865030677e-06, + "objective/entropy": 17.821250915527344, + "objective/kl": 17.395343780517578, + "objective/non_score_reward": -1.739534616470337, + "objective/rlhf_reward": -4.83543178655294, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 294.8856506347656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6329281330108643, + "step": 338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982552528381348 + }, + { + "episode": 5440, + "epoch": 0.09778193191214006, + "loss/policy_avg": -0.0766182690858841, + "lr": 2.9350076687116565e-06, + "objective/entropy": -72.60086059570312, + "objective/kl": 9.59086799621582, + "objective/non_score_reward": -0.9590868949890137, + "objective/rlhf_reward": -5.836347579956055, + "objective/scores": -0.5, + "policy/approxkl_avg": 61.654563903808594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8256030678749084, + "step": 339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002091407775879 + }, + { + "episode": 5456, + "epoch": 0.0980695258295287, + "loss/policy_avg": 0.039667725563049316, + "lr": 2.9348159509202457e-06, + "objective/entropy": 194.9036865234375, + "objective/kl": 9.621345520019531, + "objective/non_score_reward": -0.9621344804763794, + "objective/rlhf_reward": -3.4485378623008724, + "objective/scores": 0.1, + "policy/approxkl_avg": 69.39463806152344, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.49096935987472534, + "step": 340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0042881965637207 + }, + { + "episode": 5472, + "epoch": 0.09835711974691735, + "loss/policy_avg": 0.37447261810302734, + "lr": 2.9346242331288345e-06, + "objective/entropy": 5.140836715698242, + "objective/kl": 12.80933666229248, + "objective/non_score_reward": -1.2809334993362427, + "objective/rlhf_reward": -3.001028003469978, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 136.77401733398438, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6044674515724182, + "step": 341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971853494644165 + }, + { + "episode": 5488, + "epoch": 0.098644713664306, + "loss/policy_avg": 0.07032056152820587, + "lr": 2.9344325153374233e-06, + "objective/entropy": 34.30079650878906, + "objective/kl": 11.535228729248047, + "objective/non_score_reward": -1.1535229682922363, + "objective/rlhf_reward": -2.4913854024567943, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 80.33157348632812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5720343589782715, + "step": 342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973433017730713 + }, + { + "episode": 5504, + "epoch": 0.09893230758169465, + "loss/policy_avg": 0.039407968521118164, + "lr": 2.9342407975460126e-06, + "objective/entropy": 29.278091430664062, + "objective/kl": 1.9333374500274658, + "objective/non_score_reward": -0.19333375990390778, + "objective/rlhf_reward": 1.0514937088164578, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.3760266304016113, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8676018714904785, + "step": 343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0017752647399902 + }, + { + "episode": 5520, + "epoch": 0.09921990149908329, + "loss/policy_avg": 0.3448472023010254, + "lr": 2.9340490797546014e-06, + "objective/entropy": -73.16712951660156, + "objective/kl": 12.475850105285645, + "objective/non_score_reward": -1.2475850582122803, + "objective/rlhf_reward": -2.066621486784193, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 89.76551818847656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4174689054489136, + "step": 344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997223973274231 + }, + { + "episode": 5536, + "epoch": 0.09950749541647194, + "loss/policy_avg": 0.019372761249542236, + "lr": 2.93385736196319e-06, + "objective/entropy": 100.60924530029297, + "objective/kl": 12.531920433044434, + "objective/non_score_reward": -1.2531919479370117, + "objective/rlhf_reward": -4.612768149375915, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.791622161865234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.47664302587509155, + "step": 345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009078979492188 + }, + { + "episode": 5552, + "epoch": 0.0997950893338606, + "loss/policy_avg": 0.6332914233207703, + "lr": 2.933665644171779e-06, + "objective/entropy": 134.54344177246094, + "objective/kl": 11.1735200881958, + "objective/non_score_reward": -1.11735200881958, + "objective/rlhf_reward": -4.0694083034992214, + "objective/scores": 0.1, + "policy/approxkl_avg": 68.55665588378906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.561040997505188, + "step": 346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998960256576538 + }, + { + "episode": 5568, + "epoch": 0.10008268325124924, + "loss/policy_avg": 0.11013670265674591, + "lr": 2.9334739263803682e-06, + "objective/entropy": -63.645904541015625, + "objective/kl": 14.62928581237793, + "objective/non_score_reward": -1.4629285335540771, + "objective/rlhf_reward": -7.851714134216309, + "objective/scores": -0.5, + "policy/approxkl_avg": 86.95831298828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7983701229095459, + "step": 347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993393421173096 + }, + { + "episode": 5584, + "epoch": 0.10037027716863789, + "loss/policy_avg": 0.5474449396133423, + "lr": 2.933282208588957e-06, + "objective/entropy": -133.3090362548828, + "objective/kl": 13.566909790039062, + "objective/non_score_reward": -1.3566908836364746, + "objective/rlhf_reward": -1.0267638623714443, + "objective/scores": 1.1, + "policy/approxkl_avg": 108.0693359375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7627835273742676, + "step": 348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999748706817627 + }, + { + "episode": 5600, + "epoch": 0.10065787108602653, + "loss/policy_avg": 0.07263286411762238, + "lr": 2.933090490797546e-06, + "objective/entropy": 126.58871459960938, + "objective/kl": 9.20844554901123, + "objective/non_score_reward": -0.920844554901123, + "objective/rlhf_reward": 0.7166217207908634, + "objective/scores": 1.1, + "policy/approxkl_avg": 49.46379089355469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9294000864028931, + "step": 349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979853630065918 + }, + { + "episode": 5616, + "epoch": 0.10094546500341518, + "loss/policy_avg": 0.1791333109140396, + "lr": 2.932898773006135e-06, + "objective/entropy": 0.43863677978515625, + "objective/kl": 8.826881408691406, + "objective/non_score_reward": -0.8826882243156433, + "objective/rlhf_reward": -1.130752792954445, + "objective/scores": 0.6, + "policy/approxkl_avg": 41.58755874633789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6624951362609863, + "step": 350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989933967590332 + }, + { + "episode": 5632, + "epoch": 0.10123305892080382, + "loss/policy_avg": 0.20636233687400818, + "lr": 2.932707055214724e-06, + "objective/entropy": 150.28713989257812, + "objective/kl": 8.030426025390625, + "objective/non_score_reward": -0.8030425906181335, + "objective/rlhf_reward": -5.212170124053955, + "objective/scores": -0.5, + "policy/approxkl_avg": 98.02102661132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7396011352539062, + "step": 351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004942417144775 + }, + { + "episode": 5648, + "epoch": 0.10152065283819248, + "loss/policy_avg": 0.07704152166843414, + "lr": 2.932515337423313e-06, + "objective/entropy": 49.47189712524414, + "objective/kl": 14.631547927856445, + "objective/non_score_reward": -1.4631547927856445, + "objective/rlhf_reward": -5.45261919721961, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.44937133789062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7191329002380371, + "step": 352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998352527618408 + }, + { + "episode": 5664, + "epoch": 0.10180824675558112, + "loss/policy_avg": 0.09739228338003159, + "lr": 2.932323619631902e-06, + "objective/entropy": -24.802486419677734, + "objective/kl": 10.526655197143555, + "objective/non_score_reward": -1.0526655912399292, + "objective/rlhf_reward": 0.1893375083804134, + "objective/scores": 1.1, + "policy/approxkl_avg": 132.25132751464844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6535590887069702, + "step": 353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997605562210083 + }, + { + "episode": 5680, + "epoch": 0.10209584067296977, + "loss/policy_avg": 0.13763202726840973, + "lr": 2.9321319018404907e-06, + "objective/entropy": -119.98158264160156, + "objective/kl": 5.913897514343262, + "objective/non_score_reward": -0.5913897752761841, + "objective/rlhf_reward": -1.9655589669942855, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.558517456054688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6357654333114624, + "step": 354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000248908996582 + }, + { + "episode": 5696, + "epoch": 0.10238343459035841, + "loss/policy_avg": 0.527988851070404, + "lr": 2.93194018404908e-06, + "objective/entropy": -29.399810791015625, + "objective/kl": 13.658191680908203, + "objective/non_score_reward": -1.3658192157745361, + "objective/rlhf_reward": -3.063276922702789, + "objective/scores": 0.6, + "policy/approxkl_avg": 92.77324676513672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.728462815284729, + "step": 355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9965999126434326 + }, + { + "episode": 5712, + "epoch": 0.10267102850774706, + "loss/policy_avg": 0.6842390298843384, + "lr": 2.9317484662576688e-06, + "objective/entropy": 109.09453582763672, + "objective/kl": 10.667269706726074, + "objective/non_score_reward": -1.0667269229888916, + "objective/rlhf_reward": -2.710648416486338, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 69.14006042480469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7146704196929932, + "step": 356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998762607574463 + }, + { + "episode": 5728, + "epoch": 0.1029586224251357, + "loss/policy_avg": 0.07212770730257034, + "lr": 2.9315567484662576e-06, + "objective/entropy": -49.94731903076172, + "objective/kl": 2.350062370300293, + "objective/non_score_reward": -0.23500625789165497, + "objective/rlhf_reward": -0.5400250017642976, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.055467128753662, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5184276103973389, + "step": 357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012760162353516 + }, + { + "episode": 5744, + "epoch": 0.10324621634252436, + "loss/policy_avg": -0.052633900195360184, + "lr": 2.931365030674847e-06, + "objective/entropy": -12.867652893066406, + "objective/kl": 7.817996025085449, + "objective/non_score_reward": -0.7817996740341187, + "objective/rlhf_reward": -5.127198219299316, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.15538787841797, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7452627420425415, + "step": 358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967327117919922 + }, + { + "episode": 5760, + "epoch": 0.103533810259913, + "loss/policy_avg": 0.3177230954170227, + "lr": 2.9311733128834356e-06, + "objective/entropy": 126.2174301147461, + "objective/kl": 15.039046287536621, + "objective/non_score_reward": -1.5039048194885254, + "objective/rlhf_reward": -8.015619277954102, + "objective/scores": -0.5, + "policy/approxkl_avg": 148.06988525390625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5116921067237854, + "step": 359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9969123601913452 + }, + { + "episode": 5776, + "epoch": 0.10382140417730165, + "loss/policy_avg": 0.015442397445440292, + "lr": 2.930981595092025e-06, + "objective/entropy": 113.17227935791016, + "objective/kl": 15.746637344360352, + "objective/non_score_reward": -1.574663758277893, + "objective/rlhf_reward": -5.898655241727829, + "objective/scores": 0.1, + "policy/approxkl_avg": 97.0933837890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6202758550643921, + "step": 360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986428022384644 + }, + { + "episode": 5792, + "epoch": 0.10410899809469029, + "loss/policy_avg": 0.2362920194864273, + "lr": 2.9307898773006137e-06, + "objective/entropy": -133.8544464111328, + "objective/kl": 9.721014022827148, + "objective/non_score_reward": -0.9721014499664307, + "objective/rlhf_reward": 0.5115940213203434, + "objective/scores": 1.1, + "policy/approxkl_avg": 94.33212280273438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6412885189056396, + "step": 361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975833892822266 + }, + { + "episode": 5808, + "epoch": 0.10439659201207895, + "loss/policy_avg": 0.16871516406536102, + "lr": 2.9305981595092025e-06, + "objective/entropy": -65.79563903808594, + "objective/kl": 13.480731964111328, + "objective/non_score_reward": -1.3480732440948486, + "objective/rlhf_reward": -2.468574051500532, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 170.740234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7938269972801208, + "step": 362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986903667449951 + }, + { + "episode": 5824, + "epoch": 0.10468418592946759, + "loss/policy_avg": 0.2472498118877411, + "lr": 2.9304064417177917e-06, + "objective/entropy": 209.15557861328125, + "objective/kl": 8.526037216186523, + "objective/non_score_reward": -0.8526037335395813, + "objective/rlhf_reward": 0.989584976434708, + "objective/scores": 1.1, + "policy/approxkl_avg": 67.1485366821289, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9279449582099915, + "step": 363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000206470489502 + }, + { + "episode": 5840, + "epoch": 0.10497177984685624, + "loss/policy_avg": -0.05213417112827301, + "lr": 2.9302147239263805e-06, + "objective/entropy": -235.3236541748047, + "objective/kl": 16.88799285888672, + "objective/non_score_reward": -1.688799262046814, + "objective/rlhf_reward": -6.355197063088417, + "objective/scores": 0.1, + "policy/approxkl_avg": 340.7134094238281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7127069234848022, + "step": 364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987154006958008 + }, + { + "episode": 5856, + "epoch": 0.10525937376424489, + "loss/policy_avg": 0.2484489381313324, + "lr": 2.9300230061349698e-06, + "objective/entropy": -33.450096130371094, + "objective/kl": 13.260076522827148, + "objective/non_score_reward": -1.326007604598999, + "objective/rlhf_reward": -4.9040304780006405, + "objective/scores": 0.1, + "policy/approxkl_avg": 131.92379760742188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5392622947692871, + "step": 365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975396394729614 + }, + { + "episode": 5872, + "epoch": 0.10554696768163353, + "loss/policy_avg": -0.4047367572784424, + "lr": 2.9298312883435586e-06, + "objective/entropy": 82.97904968261719, + "objective/kl": 7.515501499176025, + "objective/non_score_reward": -0.7515501976013184, + "objective/rlhf_reward": -0.082481582404348, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 43.22978973388672, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5681832432746887, + "step": 366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999988079071045 + }, + { + "episode": 5888, + "epoch": 0.10583456159902219, + "loss/policy_avg": 0.2642351984977722, + "lr": 2.929639570552147e-06, + "objective/entropy": 75.05047607421875, + "objective/kl": 7.34889030456543, + "objective/non_score_reward": -0.734889030456543, + "objective/rlhf_reward": 1.4604437291622165, + "objective/scores": 1.1, + "policy/approxkl_avg": 24.294891357421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6089534759521484, + "step": 367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984581470489502 + }, + { + "episode": 5904, + "epoch": 0.10612215551641083, + "loss/policy_avg": 0.4635845720767975, + "lr": 2.929447852760736e-06, + "objective/entropy": 111.49870300292969, + "objective/kl": 14.92940902709961, + "objective/non_score_reward": -1.4929410219192505, + "objective/rlhf_reward": -1.5717640727758404, + "objective/scores": 1.1, + "policy/approxkl_avg": 170.13226318359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.896259069442749, + "step": 368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.99690842628479 + }, + { + "episode": 5920, + "epoch": 0.10640974943379948, + "loss/policy_avg": 0.0428597554564476, + "lr": 2.929256134969325e-06, + "objective/entropy": -17.536102294921875, + "objective/kl": 13.898289680480957, + "objective/non_score_reward": -1.3898290395736694, + "objective/rlhf_reward": -1.1593163371086117, + "objective/scores": 1.1, + "policy/approxkl_avg": 113.92097473144531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5009787678718567, + "step": 369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998550534248352 + }, + { + "episode": 5936, + "epoch": 0.10669734335118812, + "loss/policy_avg": 0.4961566627025604, + "lr": 2.9290644171779142e-06, + "objective/entropy": 40.97224426269531, + "objective/kl": 8.79596996307373, + "objective/non_score_reward": -0.8795971274375916, + "objective/rlhf_reward": -0.5946695550691811, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 88.30393981933594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6913511753082275, + "step": 370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971789121627808 + }, + { + "episode": 5952, + "epoch": 0.10698493726857677, + "loss/policy_avg": 0.3505915701389313, + "lr": 2.928872699386503e-06, + "objective/entropy": -14.86764907836914, + "objective/kl": 13.301910400390625, + "objective/non_score_reward": -1.3301911354064941, + "objective/rlhf_reward": -3.495935942205499, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 67.43904113769531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5367602109909058, + "step": 371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998258352279663 + }, + { + "episode": 5968, + "epoch": 0.10727253118596541, + "loss/policy_avg": 0.04512263089418411, + "lr": 2.928680981595092e-06, + "objective/entropy": 106.7132568359375, + "objective/kl": 7.053742408752441, + "objective/non_score_reward": -0.7053742408752441, + "objective/rlhf_reward": -2.4214967399835583, + "objective/scores": 0.1, + "policy/approxkl_avg": 64.6439437866211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6978051066398621, + "step": 372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001304149627686 + }, + { + "episode": 5984, + "epoch": 0.10756012510335407, + "loss/policy_avg": 0.10745556652545929, + "lr": 2.928489263803681e-06, + "objective/entropy": -95.63072967529297, + "objective/kl": 16.271547317504883, + "objective/non_score_reward": -1.6271545886993408, + "objective/rlhf_reward": -2.1086182355880734, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.08837890625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8408117294311523, + "step": 373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9974865913391113 + }, + { + "episode": 6000, + "epoch": 0.10784771902074271, + "loss/policy_avg": 0.18899598717689514, + "lr": 2.92829754601227e-06, + "objective/entropy": -149.71536254882812, + "objective/kl": 12.09565544128418, + "objective/non_score_reward": -1.2095654010772705, + "objective/rlhf_reward": -0.438261783123016, + "objective/scores": 1.1, + "policy/approxkl_avg": 102.32780456542969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5978922843933105, + "step": 374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985668659210205 + }, + { + "episode": 6016, + "epoch": 0.10813531293813136, + "loss/policy_avg": 0.0514984093606472, + "lr": 2.928105828220859e-06, + "objective/entropy": 53.81720733642578, + "objective/kl": 13.123618125915527, + "objective/non_score_reward": -1.3123618364334106, + "objective/rlhf_reward": -3.4246185078945865, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 97.61923217773438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.561882495880127, + "step": 375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986340999603271 + }, + { + "episode": 6032, + "epoch": 0.10842290685552, + "loss/policy_avg": 0.05128341168165207, + "lr": 2.927914110429448e-06, + "objective/entropy": -142.33819580078125, + "objective/kl": 12.197196960449219, + "objective/non_score_reward": -1.219719648361206, + "objective/rlhf_reward": -3.274758789602833, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 29.826644897460938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7544271945953369, + "step": 376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0010008811950684 + }, + { + "episode": 6048, + "epoch": 0.10871050077290866, + "loss/policy_avg": 0.3710365891456604, + "lr": 2.9277223926380367e-06, + "objective/entropy": -158.48403930664062, + "objective/kl": 12.475811958312988, + "objective/non_score_reward": -1.2475812435150146, + "objective/rlhf_reward": -6.990324974060059, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.358436584472656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7211358547210693, + "step": 377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9964120388031006 + }, + { + "episode": 6064, + "epoch": 0.1089980946902973, + "loss/policy_avg": 0.44148433208465576, + "lr": 2.927530674846626e-06, + "objective/entropy": 203.32000732421875, + "objective/kl": 11.192790985107422, + "objective/non_score_reward": -1.1192790269851685, + "objective/rlhf_reward": -0.07711610794067347, + "objective/scores": 1.1, + "policy/approxkl_avg": 140.77037048339844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49844634532928467, + "step": 378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996138334274292 + }, + { + "episode": 6080, + "epoch": 0.10928568860768595, + "loss/policy_avg": 0.07269307225942612, + "lr": 2.9273389570552148e-06, + "objective/entropy": -48.06731414794922, + "objective/kl": 7.008990287780762, + "objective/non_score_reward": -0.7008991241455078, + "objective/rlhf_reward": -0.40359642207622537, + "objective/scores": 0.6, + "policy/approxkl_avg": 30.824771881103516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.765288233757019, + "step": 379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009591579437256 + }, + { + "episode": 6096, + "epoch": 0.10957328252507459, + "loss/policy_avg": 0.3331525921821594, + "lr": 2.9271472392638036e-06, + "objective/entropy": -129.8748779296875, + "objective/kl": 6.912174224853516, + "objective/non_score_reward": -0.6912174224853516, + "objective/rlhf_reward": -2.3648696750402447, + "objective/scores": 0.1, + "policy/approxkl_avg": 86.31391143798828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7057325839996338, + "step": 380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999925136566162 + }, + { + "episode": 6112, + "epoch": 0.10986087644246324, + "loss/policy_avg": 0.2209397256374359, + "lr": 2.926955521472393e-06, + "objective/entropy": -17.206472396850586, + "objective/kl": 12.392889022827148, + "objective/non_score_reward": -1.2392890453338623, + "objective/rlhf_reward": -3.2238229076067606, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 61.33112716674805, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7327935695648193, + "step": 381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999511957168579 + }, + { + "episode": 6128, + "epoch": 0.11014847035985188, + "loss/policy_avg": 0.015440240502357483, + "lr": 2.9267638036809816e-06, + "objective/entropy": 136.05276489257812, + "objective/kl": 10.93641471862793, + "objective/non_score_reward": -1.0936416387557983, + "objective/rlhf_reward": -3.9745664507150646, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.861133575439453, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.642856240272522, + "step": 382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005943775177 + }, + { + "episode": 6144, + "epoch": 0.11043606427724054, + "loss/policy_avg": 0.4618483781814575, + "lr": 2.926572085889571e-06, + "objective/entropy": 248.1273193359375, + "objective/kl": 10.450383186340332, + "objective/non_score_reward": -1.0450382232666016, + "objective/rlhf_reward": -6.180152893066406, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.98204231262207, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.585578441619873, + "step": 383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986340999603271 + }, + { + "episode": 6160, + "epoch": 0.11072365819462919, + "loss/policy_avg": 0.527269721031189, + "lr": 2.9263803680981597e-06, + "objective/entropy": -85.81866455078125, + "objective/kl": 7.232128143310547, + "objective/non_score_reward": -0.7232127785682678, + "objective/rlhf_reward": -0.4928510844707489, + "objective/scores": 0.6, + "policy/approxkl_avg": 35.018524169921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6665592789649963, + "step": 384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987972974777222 + }, + { + "episode": 6176, + "epoch": 0.11101125211201783, + "loss/policy_avg": 0.012356449849903584, + "lr": 2.9261886503067485e-06, + "objective/entropy": -87.99043273925781, + "objective/kl": 15.716808319091797, + "objective/non_score_reward": -1.5716807842254639, + "objective/rlhf_reward": -1.8867232263088223, + "objective/scores": 1.1, + "policy/approxkl_avg": 112.06707763671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8166660070419312, + "step": 385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980388879776 + }, + { + "episode": 6192, + "epoch": 0.11129884602940648, + "loss/policy_avg": 0.21891015768051147, + "lr": 2.9259969325153377e-06, + "objective/entropy": 149.92320251464844, + "objective/kl": 15.745594024658203, + "objective/non_score_reward": -1.5745596885681152, + "objective/rlhf_reward": -8.298238754272461, + "objective/scores": -0.5, + "policy/approxkl_avg": 270.8348693847656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5218294858932495, + "step": 386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982883930206299 + }, + { + "episode": 6208, + "epoch": 0.11158643994679512, + "loss/policy_avg": 0.1566510945558548, + "lr": 2.9258052147239265e-06, + "objective/entropy": 279.3895263671875, + "objective/kl": 14.221332550048828, + "objective/non_score_reward": -1.4221333265304565, + "objective/rlhf_reward": -7.688533306121826, + "objective/scores": -0.5, + "policy/approxkl_avg": 161.9359588623047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9130169153213501, + "step": 387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984962940216064 + }, + { + "episode": 6224, + "epoch": 0.11187403386418378, + "loss/policy_avg": 0.3156845271587372, + "lr": 2.9256134969325158e-06, + "objective/entropy": -91.46309661865234, + "objective/kl": 5.5192670822143555, + "objective/non_score_reward": -0.5519267320632935, + "objective/rlhf_reward": -4.207706928253174, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.50323486328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4393225312232971, + "step": 388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989399909973145 + }, + { + "episode": 6240, + "epoch": 0.11216162778157242, + "loss/policy_avg": -0.24140848219394684, + "lr": 2.925421779141104e-06, + "objective/entropy": -6.909185409545898, + "objective/kl": 7.97199821472168, + "objective/non_score_reward": -0.7971999049186707, + "objective/rlhf_reward": -1.066093357578788, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 99.13380432128906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4111158847808838, + "step": 389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.03024959564209 + }, + { + "episode": 6256, + "epoch": 0.11244922169896107, + "loss/policy_avg": 0.10580594837665558, + "lr": 2.9252300613496934e-06, + "objective/entropy": 139.240234375, + "objective/kl": 7.618363857269287, + "objective/non_score_reward": -0.7618364095687866, + "objective/rlhf_reward": -0.6473455265164376, + "objective/scores": 0.6, + "policy/approxkl_avg": 75.60391998291016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6820776462554932, + "step": 390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000373363494873 + }, + { + "episode": 6272, + "epoch": 0.11273681561634971, + "loss/policy_avg": 0.23864039778709412, + "lr": 2.925038343558282e-06, + "objective/entropy": -165.75607299804688, + "objective/kl": 9.443279266357422, + "objective/non_score_reward": -0.9443280696868896, + "objective/rlhf_reward": -5.777312278747559, + "objective/scores": -0.5, + "policy/approxkl_avg": 73.20270538330078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6525593996047974, + "step": 391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987553358078003 + }, + { + "episode": 6288, + "epoch": 0.11302440953373837, + "loss/policy_avg": 0.01731543242931366, + "lr": 2.924846625766871e-06, + "objective/entropy": 202.19996643066406, + "objective/kl": 15.700738906860352, + "objective/non_score_reward": -1.5700738430023193, + "objective/rlhf_reward": -5.880295610427856, + "objective/scores": 0.1, + "policy/approxkl_avg": 44.909934997558594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5537660121917725, + "step": 392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973206520080566 + }, + { + "episode": 6304, + "epoch": 0.113312003451127, + "loss/policy_avg": 0.37011197209358215, + "lr": 2.9246549079754602e-06, + "objective/entropy": -100.13412475585938, + "objective/kl": 7.34639835357666, + "objective/non_score_reward": -0.7346398234367371, + "objective/rlhf_reward": -0.5385593235492707, + "objective/scores": 0.6, + "policy/approxkl_avg": 39.805572509765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5902200937271118, + "step": 393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974175691604614 + }, + { + "episode": 6320, + "epoch": 0.11359959736851566, + "loss/policy_avg": 0.1728161722421646, + "lr": 2.924463190184049e-06, + "objective/entropy": 12.221923828125, + "objective/kl": 8.158146858215332, + "objective/non_score_reward": -0.8158146142959595, + "objective/rlhf_reward": -2.863258576393127, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.08139419555664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6115730404853821, + "step": 394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969968795776367 + }, + { + "episode": 6336, + "epoch": 0.1138871912859043, + "loss/policy_avg": 0.1638275384902954, + "lr": 2.924271472392638e-06, + "objective/entropy": -92.78604125976562, + "objective/kl": 10.350639343261719, + "objective/non_score_reward": -1.0350639820098877, + "objective/rlhf_reward": -3.7402558535337445, + "objective/scores": 0.1, + "policy/approxkl_avg": 192.456298828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5784432888031006, + "step": 395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983556270599365 + }, + { + "episode": 6352, + "epoch": 0.11417478520329295, + "loss/policy_avg": 0.008675817400217056, + "lr": 2.924079754601227e-06, + "objective/entropy": 14.630111694335938, + "objective/kl": 11.06743049621582, + "objective/non_score_reward": -1.1067430973052979, + "objective/rlhf_reward": -4.026972463726997, + "objective/scores": 0.1, + "policy/approxkl_avg": 110.01115417480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9663007259368896, + "step": 396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992257356643677 + }, + { + "episode": 6368, + "epoch": 0.11446237912068159, + "loss/policy_avg": 0.22570039331912994, + "lr": 2.923888036809816e-06, + "objective/entropy": -62.594932556152344, + "objective/kl": 11.640623092651367, + "objective/non_score_reward": -1.1640625, + "objective/rlhf_reward": -2.533543454782043, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 53.970237731933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6753679513931274, + "step": 397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984841346740723 + }, + { + "episode": 6384, + "epoch": 0.11474997303807025, + "loss/policy_avg": 0.18725144863128662, + "lr": 2.923696319018405e-06, + "objective/entropy": -118.94644165039062, + "objective/kl": 10.655905723571777, + "objective/non_score_reward": -1.0655906200408936, + "objective/rlhf_reward": -3.8623625993728634, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.7579345703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.645980954170227, + "step": 398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001778602600098 + }, + { + "episode": 6400, + "epoch": 0.11503756695545889, + "loss/policy_avg": 0.04687364026904106, + "lr": 2.923504601226994e-06, + "objective/entropy": 107.14441680908203, + "objective/kl": 5.046243667602539, + "objective/non_score_reward": -0.5046243071556091, + "objective/rlhf_reward": 2.381502674520016, + "objective/scores": 1.1, + "policy/approxkl_avg": 28.62555503845215, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6493499279022217, + "step": 399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00180721282959 + }, + { + "episode": 6416, + "epoch": 0.11532516087284754, + "loss/policy_avg": 0.18883462250232697, + "lr": 2.9233128834355827e-06, + "objective/entropy": 93.51673126220703, + "objective/kl": 9.593416213989258, + "objective/non_score_reward": -0.9593416452407837, + "objective/rlhf_reward": -1.4373664021492005, + "objective/scores": 0.6, + "policy/approxkl_avg": 43.221275329589844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5561137199401855, + "step": 400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976122379302979 + }, + { + "episode": 6432, + "epoch": 0.11561275479023618, + "loss/policy_avg": 0.07294710725545883, + "lr": 2.923121165644172e-06, + "objective/entropy": 41.92622375488281, + "objective/kl": 10.513269424438477, + "objective/non_score_reward": -1.0513269901275635, + "objective/rlhf_reward": -6.205307960510254, + "objective/scores": -0.5, + "policy/approxkl_avg": 136.67205810546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40027546882629395, + "step": 401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000309944152832 + }, + { + "episode": 6448, + "epoch": 0.11590034870762483, + "loss/policy_avg": -0.12561793625354767, + "lr": 2.9229294478527608e-06, + "objective/entropy": 82.13600158691406, + "objective/kl": 7.441690444946289, + "objective/non_score_reward": -0.7441689968109131, + "objective/rlhf_reward": -0.05295715177175664, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 81.62566375732422, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7616924047470093, + "step": 402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0073320865631104 + }, + { + "episode": 6464, + "epoch": 0.11618794262501349, + "loss/policy_avg": 0.19540420174598694, + "lr": 2.92273773006135e-06, + "objective/entropy": -97.04734802246094, + "objective/kl": 9.546144485473633, + "objective/non_score_reward": -0.954614520072937, + "objective/rlhf_reward": -5.818458080291748, + "objective/scores": -0.5, + "policy/approxkl_avg": 41.53440475463867, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6575814485549927, + "step": 403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998401403427124 + }, + { + "episode": 6480, + "epoch": 0.11647553654240213, + "loss/policy_avg": 0.2724316716194153, + "lr": 2.922546012269939e-06, + "objective/entropy": -222.66941833496094, + "objective/kl": 6.35382080078125, + "objective/non_score_reward": -0.6353820562362671, + "objective/rlhf_reward": -2.141528314352035, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.857295989990234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5307646989822388, + "step": 404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0005037784576416 + }, + { + "episode": 6496, + "epoch": 0.11676313045979078, + "loss/policy_avg": -0.19320714473724365, + "lr": 2.9223542944785276e-06, + "objective/entropy": 128.687744140625, + "objective/kl": 9.543111801147461, + "objective/non_score_reward": -0.954311192035675, + "objective/rlhf_reward": -2.2131247407832912, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 83.3912353515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5714800953865051, + "step": 405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.014221668243408 + }, + { + "episode": 6512, + "epoch": 0.11705072437717942, + "loss/policy_avg": -0.02320697158575058, + "lr": 2.922162576687117e-06, + "objective/entropy": -111.4472427368164, + "objective/kl": 14.546286582946777, + "objective/non_score_reward": -1.4546287059783936, + "objective/rlhf_reward": -5.418514943122863, + "objective/scores": 0.1, + "policy/approxkl_avg": 198.98716735839844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6330698728561401, + "step": 406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000318765640259 + }, + { + "episode": 6528, + "epoch": 0.11733831829456808, + "loss/policy_avg": -0.05689065158367157, + "lr": 2.9219708588957057e-06, + "objective/entropy": -94.29441833496094, + "objective/kl": 6.670037746429443, + "objective/non_score_reward": -0.6670037508010864, + "objective/rlhf_reward": 1.731984862685204, + "objective/scores": 1.1, + "policy/approxkl_avg": 58.32909393310547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4962747097015381, + "step": 407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0033135414123535 + }, + { + "episode": 6544, + "epoch": 0.11762591221195672, + "loss/policy_avg": 0.058236684650182724, + "lr": 2.9217791411042945e-06, + "objective/entropy": 73.15773010253906, + "objective/kl": 10.113113403320312, + "objective/non_score_reward": -1.011311411857605, + "objective/rlhf_reward": -3.6452456325292584, + "objective/scores": 0.1, + "policy/approxkl_avg": 100.70352172851562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4618947505950928, + "step": 408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986381530761719 + }, + { + "episode": 6560, + "epoch": 0.11791350612934537, + "loss/policy_avg": 0.2816429138183594, + "lr": 2.9215874233128837e-06, + "objective/entropy": 199.59906005859375, + "objective/kl": 8.274097442626953, + "objective/non_score_reward": -0.8274096846580505, + "objective/rlhf_reward": -5.309638977050781, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.900413513183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4910353124141693, + "step": 409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000108480453491 + }, + { + "episode": 6576, + "epoch": 0.11820110004673401, + "loss/policy_avg": 0.43750858306884766, + "lr": 2.9213957055214725e-06, + "objective/entropy": 63.67340087890625, + "objective/kl": 17.44285011291504, + "objective/non_score_reward": -1.7442851066589355, + "objective/rlhf_reward": -4.577140188217163, + "objective/scores": 0.6, + "policy/approxkl_avg": 116.11654663085938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7501018047332764, + "step": 410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976780414581299 + }, + { + "episode": 6592, + "epoch": 0.11848869396412266, + "loss/policy_avg": 0.32210612297058105, + "lr": 2.9212039877300618e-06, + "objective/entropy": -220.1966552734375, + "objective/kl": 11.985102653503418, + "objective/non_score_reward": -1.1985102891921997, + "objective/rlhf_reward": -4.39404130578041, + "objective/scores": 0.1, + "policy/approxkl_avg": 161.2996826171875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5652351379394531, + "step": 411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979429244995117 + }, + { + "episode": 6608, + "epoch": 0.1187762878815113, + "loss/policy_avg": 0.1565137803554535, + "lr": 2.92101226993865e-06, + "objective/entropy": -16.16411781311035, + "objective/kl": 14.623466491699219, + "objective/non_score_reward": -1.4623467922210693, + "objective/rlhf_reward": -7.849387168884277, + "objective/scores": -0.5, + "policy/approxkl_avg": 106.36109924316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5213359594345093, + "step": 412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997380256652832 + }, + { + "episode": 6624, + "epoch": 0.11906388179889996, + "loss/policy_avg": 0.16901980340480804, + "lr": 2.9208205521472394e-06, + "objective/entropy": -32.831336975097656, + "objective/kl": 9.713844299316406, + "objective/non_score_reward": -0.9713844060897827, + "objective/rlhf_reward": -1.9381265444325761, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 27.6256160736084, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5835360288619995, + "step": 413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000209093093872 + }, + { + "episode": 6640, + "epoch": 0.1193514757162886, + "loss/policy_avg": -0.03889453411102295, + "lr": 2.920628834355828e-06, + "objective/entropy": 3.4859085083007812, + "objective/kl": 7.356728553771973, + "objective/non_score_reward": -0.7356729507446289, + "objective/rlhf_reward": -1.3385716415086562, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 150.19898986816406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6149972677230835, + "step": 414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001542568206787 + }, + { + "episode": 6656, + "epoch": 0.11963906963367725, + "loss/policy_avg": 0.07131887972354889, + "lr": 2.920437116564417e-06, + "objective/entropy": 135.44573974609375, + "objective/kl": 6.854515075683594, + "objective/non_score_reward": -0.6854515075683594, + "objective/rlhf_reward": -4.7418060302734375, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.425703048706055, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6121791005134583, + "step": 415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997809886932373 + }, + { + "episode": 6672, + "epoch": 0.11992666355106589, + "loss/policy_avg": -0.02449220046401024, + "lr": 2.9202453987730062e-06, + "objective/entropy": -228.9163818359375, + "objective/kl": 3.998107433319092, + "objective/non_score_reward": -0.3998107314109802, + "objective/rlhf_reward": 0.8007570147514342, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.9464802742004395, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6649434566497803, + "step": 416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998199939727783 + }, + { + "episode": 6688, + "epoch": 0.12021425746845454, + "loss/policy_avg": 0.1401190161705017, + "lr": 2.920053680981595e-06, + "objective/entropy": 109.60284423828125, + "objective/kl": 11.780817031860352, + "objective/non_score_reward": -1.178081750869751, + "objective/rlhf_reward": -6.712327003479004, + "objective/scores": -0.5, + "policy/approxkl_avg": 101.54756927490234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6730775833129883, + "step": 417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986844062805176 + }, + { + "episode": 6704, + "epoch": 0.12050185138584318, + "loss/policy_avg": -0.057150907814502716, + "lr": 2.919861963190184e-06, + "objective/entropy": 114.74760437011719, + "objective/kl": 13.613632202148438, + "objective/non_score_reward": -1.361363172531128, + "objective/rlhf_reward": -7.44545316696167, + "objective/scores": -0.5, + "policy/approxkl_avg": 83.6063232421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7602818012237549, + "step": 418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99805748462677 + }, + { + "episode": 6720, + "epoch": 0.12078944530323184, + "loss/policy_avg": 0.33992117643356323, + "lr": 2.919670245398773e-06, + "objective/entropy": 95.6858139038086, + "objective/kl": 13.743134498596191, + "objective/non_score_reward": -1.3743133544921875, + "objective/rlhf_reward": -3.3745473048844676, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 54.830780029296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.745945930480957, + "step": 419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975709915161133 + }, + { + "episode": 6736, + "epoch": 0.12107703922062048, + "loss/policy_avg": 0.04007640480995178, + "lr": 2.919478527607362e-06, + "objective/entropy": 158.9964599609375, + "objective/kl": 11.750536918640137, + "objective/non_score_reward": -1.175053596496582, + "objective/rlhf_reward": -4.300214721262455, + "objective/scores": 0.1, + "policy/approxkl_avg": 262.0268249511719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.691627025604248, + "step": 420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001215934753418 + }, + { + "episode": 6752, + "epoch": 0.12136463313800913, + "loss/policy_avg": 0.3095911741256714, + "lr": 2.919286809815951e-06, + "objective/entropy": -35.89824676513672, + "objective/kl": 11.786452293395996, + "objective/non_score_reward": -1.178645372390747, + "objective/rlhf_reward": -4.314581578969955, + "objective/scores": 0.1, + "policy/approxkl_avg": 36.27446746826172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8371065855026245, + "step": 421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9973646402359009 + }, + { + "episode": 6768, + "epoch": 0.12165222705539779, + "loss/policy_avg": 0.22105564177036285, + "lr": 2.91909509202454e-06, + "objective/entropy": 15.297624588012695, + "objective/kl": 9.64991283416748, + "objective/non_score_reward": -0.9649913311004639, + "objective/rlhf_reward": -3.459965533018112, + "objective/scores": 0.1, + "policy/approxkl_avg": 37.608909606933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6050717830657959, + "step": 422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99948251247406 + }, + { + "episode": 6784, + "epoch": 0.12193982097278643, + "loss/policy_avg": 0.18998616933822632, + "lr": 2.9189033742331287e-06, + "objective/entropy": -28.578590393066406, + "objective/kl": 5.720818519592285, + "objective/non_score_reward": -0.5720819234848022, + "objective/rlhf_reward": -0.3409164053963978, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.9535547494888306, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5882730484008789, + "step": 423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995017051696777 + }, + { + "episode": 6800, + "epoch": 0.12222741489017508, + "loss/policy_avg": 0.6313632726669312, + "lr": 2.918711656441718e-06, + "objective/entropy": 219.71212768554688, + "objective/kl": 10.877071380615234, + "objective/non_score_reward": -1.0877070426940918, + "objective/rlhf_reward": -6.350828170776367, + "objective/scores": -0.5, + "policy/approxkl_avg": 94.11060333251953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6537734270095825, + "step": 424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975165128707886 + }, + { + "episode": 6816, + "epoch": 0.12251500880756372, + "loss/policy_avg": 0.024708323180675507, + "lr": 2.918519938650307e-06, + "objective/entropy": -79.56177520751953, + "objective/kl": 16.1546573638916, + "objective/non_score_reward": -1.615465760231018, + "objective/rlhf_reward": -2.0618630260229107, + "objective/scores": 1.1, + "policy/approxkl_avg": 151.0323028564453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7285983562469482, + "step": 425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983255863189697 + }, + { + "episode": 6832, + "epoch": 0.12280260272495237, + "loss/policy_avg": 0.07644349336624146, + "lr": 2.918328220858896e-06, + "objective/entropy": -197.13235473632812, + "objective/kl": 7.779457092285156, + "objective/non_score_reward": -0.7779456973075867, + "objective/rlhf_reward": -1.5555234988599567, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 37.899497985839844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6256439685821533, + "step": 426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981521368026733 + }, + { + "episode": 6848, + "epoch": 0.12309019664234101, + "loss/policy_avg": 0.26216840744018555, + "lr": 2.918136503067485e-06, + "objective/entropy": 4.240196228027344, + "objective/kl": 9.129347801208496, + "objective/non_score_reward": -0.9129348397254944, + "objective/rlhf_reward": -5.651739120483398, + "objective/scores": -0.5, + "policy/approxkl_avg": 51.532691955566406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6356798410415649, + "step": 427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993443489074707 + }, + { + "episode": 6864, + "epoch": 0.12337779055972967, + "loss/policy_avg": 0.08391077816486359, + "lr": 2.9179447852760736e-06, + "objective/entropy": 116.065185546875, + "objective/kl": 12.789249420166016, + "objective/non_score_reward": -1.2789249420166016, + "objective/rlhf_reward": -4.71570006608963, + "objective/scores": 0.1, + "policy/approxkl_avg": 107.84225463867188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6037728786468506, + "step": 428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9945564270019531 + }, + { + "episode": 6880, + "epoch": 0.1236653844771183, + "loss/policy_avg": 0.5531384944915771, + "lr": 2.917753067484663e-06, + "objective/entropy": 259.3071594238281, + "objective/kl": 14.592714309692383, + "objective/non_score_reward": -1.4592714309692383, + "objective/rlhf_reward": -4.175226414174425, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 109.08944702148438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6318076848983765, + "step": 429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996614456176758 + }, + { + "episode": 6896, + "epoch": 0.12395297839450696, + "loss/policy_avg": 0.21222534775733948, + "lr": 2.9175613496932517e-06, + "objective/entropy": -58.68345642089844, + "objective/kl": 11.372722625732422, + "objective/non_score_reward": -1.1372722387313843, + "objective/rlhf_reward": -1.625370000244352, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 109.58206939697266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47258567810058594, + "step": 430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987989664077759 + }, + { + "episode": 6912, + "epoch": 0.1242405723118956, + "loss/policy_avg": 0.22482730448246002, + "lr": 2.9173696319018405e-06, + "objective/entropy": 67.2789535522461, + "objective/kl": 9.9976806640625, + "objective/non_score_reward": -0.9997680187225342, + "objective/rlhf_reward": -3.5990721903741356, + "objective/scores": 0.1, + "policy/approxkl_avg": 59.97844696044922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5720049142837524, + "step": 431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989635944366455 + }, + { + "episode": 6928, + "epoch": 0.12452816622928425, + "loss/policy_avg": 0.4322272539138794, + "lr": 2.9171779141104297e-06, + "objective/entropy": -290.14111328125, + "objective/kl": 14.833064079284668, + "objective/non_score_reward": -1.4833064079284668, + "objective/rlhf_reward": -4.108396942886422, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 118.15055847167969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7155340313911438, + "step": 432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9969213008880615 + }, + { + "episode": 6944, + "epoch": 0.1248157601466729, + "loss/policy_avg": 0.8206951022148132, + "lr": 2.9169861963190185e-06, + "objective/entropy": 81.16971588134766, + "objective/kl": 12.39436149597168, + "objective/non_score_reward": -1.2394360303878784, + "objective/rlhf_reward": -0.55774433016777, + "objective/scores": 1.1, + "policy/approxkl_avg": 56.21806335449219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9199215173721313, + "step": 433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997793436050415 + }, + { + "episode": 6960, + "epoch": 0.12510335406406153, + "loss/policy_avg": 0.12932038307189941, + "lr": 2.9167944785276073e-06, + "objective/entropy": -108.72267150878906, + "objective/kl": 9.69011116027832, + "objective/non_score_reward": -0.969011127948761, + "objective/rlhf_reward": -3.476044631004333, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.92694091796875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7485846281051636, + "step": 434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000093936920166 + }, + { + "episode": 6976, + "epoch": 0.1253909479814502, + "loss/policy_avg": 0.25388216972351074, + "lr": 2.916602760736196e-06, + "objective/entropy": 131.2146453857422, + "objective/kl": 12.779239654541016, + "objective/non_score_reward": -1.2779240608215332, + "objective/rlhf_reward": -7.111696243286133, + "objective/scores": -0.5, + "policy/approxkl_avg": 195.51593017578125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3997848629951477, + "step": 435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982001781463623 + }, + { + "episode": 6992, + "epoch": 0.12567854189883884, + "loss/policy_avg": 0.34683698415756226, + "lr": 2.9164110429447854e-06, + "objective/entropy": 94.69212341308594, + "objective/kl": 8.857991218566895, + "objective/non_score_reward": -0.8857991695404053, + "objective/rlhf_reward": -5.543196678161621, + "objective/scores": -0.5, + "policy/approxkl_avg": 67.66943359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6433195471763611, + "step": 436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968899488449097 + }, + { + "episode": 7008, + "epoch": 0.12596613581622748, + "loss/policy_avg": 0.4975810945034027, + "lr": 2.916219325153374e-06, + "objective/entropy": 99.60516357421875, + "objective/kl": 17.621997833251953, + "objective/non_score_reward": -1.762199878692627, + "objective/rlhf_reward": -5.223970900448869, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 126.27960205078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9998393654823303, + "step": 437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977905750274658 + }, + { + "episode": 7024, + "epoch": 0.12625372973361612, + "loss/policy_avg": -0.40271705389022827, + "lr": 2.916027607361963e-06, + "objective/entropy": -58.73802947998047, + "objective/kl": 11.266298294067383, + "objective/non_score_reward": -1.1266300678253174, + "objective/rlhf_reward": -2.68169152286918, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 36.971275329589844, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5164896249771118, + "step": 438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0037713050842285 + }, + { + "episode": 7040, + "epoch": 0.1265413236510048, + "loss/policy_avg": 0.21917138993740082, + "lr": 2.9158358895705522e-06, + "objective/entropy": -138.54464721679688, + "objective/kl": 17.991540908813477, + "objective/non_score_reward": -1.799154281616211, + "objective/rlhf_reward": -5.073910774961982, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 85.51068878173828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4984607994556427, + "step": 439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979517459869385 + }, + { + "episode": 7056, + "epoch": 0.12682891756839343, + "loss/policy_avg": 0.14226309955120087, + "lr": 2.915644171779141e-06, + "objective/entropy": -70.7157974243164, + "objective/kl": 17.338790893554688, + "objective/non_score_reward": -1.7338790893554688, + "objective/rlhf_reward": -4.535516625642776, + "objective/scores": 0.6, + "policy/approxkl_avg": 281.78021240234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6945629119873047, + "step": 440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976835250854492 + }, + { + "episode": 7072, + "epoch": 0.12711651148578207, + "loss/policy_avg": 0.7052698135375977, + "lr": 2.9154524539877303e-06, + "objective/entropy": -51.95797348022461, + "objective/kl": 11.16838264465332, + "objective/non_score_reward": -1.1168383359909058, + "objective/rlhf_reward": -4.067353239655494, + "objective/scores": 0.1, + "policy/approxkl_avg": 145.99929809570312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7760474681854248, + "step": 441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976942539215088 + }, + { + "episode": 7088, + "epoch": 0.12740410540317074, + "loss/policy_avg": 0.2039000242948532, + "lr": 2.915260736196319e-06, + "objective/entropy": 70.03821563720703, + "objective/kl": 10.011724472045898, + "objective/non_score_reward": -1.0011723041534424, + "objective/rlhf_reward": -3.604689425230026, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.66981506347656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5425417423248291, + "step": 442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985206127166748 + }, + { + "episode": 7104, + "epoch": 0.12769169932055938, + "loss/policy_avg": 0.046909917145967484, + "lr": 2.915069018404908e-06, + "objective/entropy": -109.3448486328125, + "objective/kl": 10.856573104858398, + "objective/non_score_reward": -1.0856573581695557, + "objective/rlhf_reward": -3.9426295816898342, + "objective/scores": 0.1, + "policy/approxkl_avg": 99.86489868164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5526108741760254, + "step": 443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977734088897705 + }, + { + "episode": 7120, + "epoch": 0.12797929323794802, + "loss/policy_avg": 0.3472464084625244, + "lr": 2.914877300613497e-06, + "objective/entropy": 133.83642578125, + "objective/kl": 8.187213897705078, + "objective/non_score_reward": -0.8187214136123657, + "objective/rlhf_reward": -1.6130260578995808, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 60.895503997802734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6827129125595093, + "step": 444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999275207519531 + }, + { + "episode": 7136, + "epoch": 0.12826688715533666, + "loss/policy_avg": 0.030289731919765472, + "lr": 2.914685582822086e-06, + "objective/entropy": -95.79446411132812, + "objective/kl": 12.967110633850098, + "objective/non_score_reward": -1.2967112064361572, + "objective/rlhf_reward": -4.786844870448112, + "objective/scores": 0.1, + "policy/approxkl_avg": 158.4815673828125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6717748641967773, + "step": 445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999089241027832 + }, + { + "episode": 7152, + "epoch": 0.12855448107272532, + "loss/policy_avg": 0.3508151173591614, + "lr": 2.9144938650306748e-06, + "objective/entropy": -29.020809173583984, + "objective/kl": 10.402070045471191, + "objective/non_score_reward": -1.0402069091796875, + "objective/rlhf_reward": -1.2371087416422095, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 22.769393920898438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6285759210586548, + "step": 446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999580383300781 + }, + { + "episode": 7168, + "epoch": 0.12884207499011396, + "loss/policy_avg": 0.5095102190971375, + "lr": 2.914302147239264e-06, + "objective/entropy": 4.9149627685546875, + "objective/kl": 11.270059585571289, + "objective/non_score_reward": -1.1270060539245605, + "objective/rlhf_reward": -0.10802436470985377, + "objective/scores": 1.1, + "policy/approxkl_avg": 42.081207275390625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6107115745544434, + "step": 447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997491836547852 + }, + { + "episode": 7184, + "epoch": 0.1291296689075026, + "loss/policy_avg": 0.15434984862804413, + "lr": 2.914110429447853e-06, + "objective/entropy": 93.47442626953125, + "objective/kl": 5.827376842498779, + "objective/non_score_reward": -0.5827376842498779, + "objective/rlhf_reward": -4.330950736999512, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.859039306640625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5905852913856506, + "step": 448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985287189483643 + }, + { + "episode": 7200, + "epoch": 0.12941726282489124, + "loss/policy_avg": 0.05858859419822693, + "lr": 2.913918711656442e-06, + "objective/entropy": -168.11392211914062, + "objective/kl": 12.220864295959473, + "objective/non_score_reward": -1.2220864295959473, + "objective/rlhf_reward": -6.888345718383789, + "objective/scores": -0.5, + "policy/approxkl_avg": 64.31013488769531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6738055348396301, + "step": 449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984550476074219 + }, + { + "episode": 7216, + "epoch": 0.1297048567422799, + "loss/policy_avg": 0.6401174068450928, + "lr": 2.913726993865031e-06, + "objective/entropy": 143.6673126220703, + "objective/kl": 16.03870964050293, + "objective/non_score_reward": -1.6038709878921509, + "objective/rlhf_reward": -6.015483951568603, + "objective/scores": 0.1, + "policy/approxkl_avg": 215.43011474609375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.852541446685791, + "step": 450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9969959259033203 + }, + { + "episode": 7232, + "epoch": 0.12999245065966855, + "loss/policy_avg": -0.20923450589179993, + "lr": 2.9135352760736196e-06, + "objective/entropy": -48.08766174316406, + "objective/kl": 3.6980080604553223, + "objective/non_score_reward": -0.36980074644088745, + "objective/rlhf_reward": -3.479203224182129, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.520061492919922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49264973402023315, + "step": 451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0105478763580322 + }, + { + "episode": 7248, + "epoch": 0.1302800445770572, + "loss/policy_avg": 0.2910463511943817, + "lr": 2.913343558282209e-06, + "objective/entropy": -25.366256713867188, + "objective/kl": 18.143402099609375, + "objective/non_score_reward": -1.814340353012085, + "objective/rlhf_reward": -9.25736141204834, + "objective/scores": -0.5, + "policy/approxkl_avg": 229.06854248046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5178902745246887, + "step": 452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988359212875366 + }, + { + "episode": 7264, + "epoch": 0.13056763849444583, + "loss/policy_avg": 0.019646476954221725, + "lr": 2.9131518404907977e-06, + "objective/entropy": 163.14837646484375, + "objective/kl": 15.703920364379883, + "objective/non_score_reward": -1.5703920125961304, + "objective/rlhf_reward": -4.548234538237253, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 97.07273864746094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.629855751991272, + "step": 453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000038146972656 + }, + { + "episode": 7280, + "epoch": 0.1308552324118345, + "loss/policy_avg": 0.03351786732673645, + "lr": 2.912960122699387e-06, + "objective/entropy": 106.53297424316406, + "objective/kl": 15.689282417297363, + "objective/non_score_reward": -1.5689281225204468, + "objective/rlhf_reward": -8.275712966918945, + "objective/scores": -0.5, + "policy/approxkl_avg": 168.91738891601562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5836728811264038, + "step": 454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003838539123535 + }, + { + "episode": 7296, + "epoch": 0.13114282632922314, + "loss/policy_avg": 0.025853008031845093, + "lr": 2.9127684049079757e-06, + "objective/entropy": -120.41839599609375, + "objective/kl": 14.210792541503906, + "objective/non_score_reward": -1.4210792779922485, + "objective/rlhf_reward": -4.022457575023758, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 25.721412658691406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5456880331039429, + "step": 455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998733639717102 + }, + { + "episode": 7312, + "epoch": 0.13143042024661178, + "loss/policy_avg": 0.11350809037685394, + "lr": 2.912576687116564e-06, + "objective/entropy": 95.34707641601562, + "objective/kl": 14.430747985839844, + "objective/non_score_reward": -1.4430747032165527, + "objective/rlhf_reward": -5.3722985744476315, + "objective/scores": 0.1, + "policy/approxkl_avg": 132.41342163085938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5699273347854614, + "step": 456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.995617151260376 + }, + { + "episode": 7328, + "epoch": 0.13171801416400042, + "loss/policy_avg": 0.5415328741073608, + "lr": 2.9123849693251534e-06, + "objective/entropy": -35.476009368896484, + "objective/kl": 8.802606582641602, + "objective/non_score_reward": -0.880260705947876, + "objective/rlhf_reward": -3.1210429728031155, + "objective/scores": 0.1, + "policy/approxkl_avg": 70.38357543945312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6030310988426208, + "step": 457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000958204269409 + }, + { + "episode": 7344, + "epoch": 0.1320056080813891, + "loss/policy_avg": 0.18966403603553772, + "lr": 2.912193251533742e-06, + "objective/entropy": 160.48800659179688, + "objective/kl": 14.643467903137207, + "objective/non_score_reward": -1.4643468856811523, + "objective/rlhf_reward": -1.45738730430603, + "objective/scores": 1.1, + "policy/approxkl_avg": 121.34632110595703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7794003486633301, + "step": 458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975476264953613 + }, + { + "episode": 7360, + "epoch": 0.13229320199877773, + "loss/policy_avg": 0.038652483373880386, + "lr": 2.9120015337423314e-06, + "objective/entropy": -118.91299438476562, + "objective/kl": 8.336200714111328, + "objective/non_score_reward": -0.8336200714111328, + "objective/rlhf_reward": -1.7303604072967347, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 37.65449523925781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7707014083862305, + "step": 459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998978614807129 + }, + { + "episode": 7376, + "epoch": 0.13258079591616637, + "loss/policy_avg": 0.2408444583415985, + "lr": 2.91180981595092e-06, + "objective/entropy": 8.406875610351562, + "objective/kl": 12.288520812988281, + "objective/non_score_reward": -1.2288521528244019, + "objective/rlhf_reward": -6.915408134460449, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.923194885253906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7405025959014893, + "step": 460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999840497970581 + }, + { + "episode": 7392, + "epoch": 0.13286838983355503, + "loss/policy_avg": 0.15941619873046875, + "lr": 2.911618098159509e-06, + "objective/entropy": 113.75840759277344, + "objective/kl": 7.790184020996094, + "objective/non_score_reward": -0.7790184617042542, + "objective/rlhf_reward": -2.7160738766193386, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.8814697265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7081342935562134, + "step": 461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986152648925781 + }, + { + "episode": 7408, + "epoch": 0.13315598375094367, + "loss/policy_avg": 0.012417584657669067, + "lr": 2.9114263803680982e-06, + "objective/entropy": -38.02949142456055, + "objective/kl": 7.286294937133789, + "objective/non_score_reward": -0.728629469871521, + "objective/rlhf_reward": -4.914518356323242, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.29476547241211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.38131576776504517, + "step": 462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974234104156494 + }, + { + "episode": 7424, + "epoch": 0.1334435776683323, + "loss/policy_avg": -0.01223007496446371, + "lr": 2.911234662576687e-06, + "objective/entropy": -89.77444458007812, + "objective/kl": 6.793033599853516, + "objective/non_score_reward": -0.6793034076690674, + "objective/rlhf_reward": -2.3172135859727856, + "objective/scores": 0.1, + "policy/approxkl_avg": 0.7712490558624268, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5063762664794922, + "step": 463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0153064727783203 + }, + { + "episode": 7440, + "epoch": 0.13373117158572095, + "loss/policy_avg": 0.7638596892356873, + "lr": 2.9110429447852763e-06, + "objective/entropy": -124.14325714111328, + "objective/kl": 12.126808166503906, + "objective/non_score_reward": -1.2126808166503906, + "objective/rlhf_reward": -4.450723177194595, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.479761123657227, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6185396313667297, + "step": 464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9997889995574951 + }, + { + "episode": 7456, + "epoch": 0.13401876550310962, + "loss/policy_avg": -0.23089107871055603, + "lr": 2.910851226993865e-06, + "objective/entropy": 12.087081909179688, + "objective/kl": 9.909775733947754, + "objective/non_score_reward": -0.9909776449203491, + "objective/rlhf_reward": 0.4360893458127979, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.47049713134766, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6658675670623779, + "step": 465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0070953369140625 + }, + { + "episode": 7472, + "epoch": 0.13430635942049826, + "loss/policy_avg": 0.0948818027973175, + "lr": 2.910659509202454e-06, + "objective/entropy": 121.13216400146484, + "objective/kl": 3.1745963096618652, + "objective/non_score_reward": -0.3174596130847931, + "objective/rlhf_reward": 0.33428153776733316, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.6572422981262207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6701205372810364, + "step": 466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000531673431396 + }, + { + "episode": 7488, + "epoch": 0.1345939533378869, + "loss/policy_avg": 0.06078179180622101, + "lr": 2.910467791411043e-06, + "objective/entropy": 109.86882019042969, + "objective/kl": 11.02462387084961, + "objective/non_score_reward": -1.1024622917175293, + "objective/rlhf_reward": -4.009849047660827, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.87171936035156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5864859819412231, + "step": 467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973387718200684 + }, + { + "episode": 7504, + "epoch": 0.13488154725527554, + "loss/policy_avg": 0.3744744658470154, + "lr": 2.910276073619632e-06, + "objective/entropy": 224.23721313476562, + "objective/kl": 11.501972198486328, + "objective/non_score_reward": -1.1501970291137695, + "objective/rlhf_reward": -4.200788414478302, + "objective/scores": 0.1, + "policy/approxkl_avg": 118.61778259277344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9559670090675354, + "step": 468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999631643295288 + }, + { + "episode": 7520, + "epoch": 0.1351691411726642, + "loss/policy_avg": 0.3056102395057678, + "lr": 2.9100843558282208e-06, + "objective/entropy": -3.655149459838867, + "objective/kl": 9.39276123046875, + "objective/non_score_reward": -0.9392762184143066, + "objective/rlhf_reward": 0.642895066738129, + "objective/scores": 1.1, + "policy/approxkl_avg": 105.86042785644531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5348349809646606, + "step": 469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986598491668701 + }, + { + "episode": 7536, + "epoch": 0.13545673509005285, + "loss/policy_avg": 0.26237964630126953, + "lr": 2.90989263803681e-06, + "objective/entropy": -334.96240234375, + "objective/kl": 11.451269149780273, + "objective/non_score_reward": -1.1451269388198853, + "objective/rlhf_reward": -0.18050771057605708, + "objective/scores": 1.1, + "policy/approxkl_avg": 53.953277587890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8247783184051514, + "step": 470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9965124130249023 + }, + { + "episode": 7552, + "epoch": 0.1357443290074415, + "loss/policy_avg": 0.2353116273880005, + "lr": 2.909700920245399e-06, + "objective/entropy": -169.8182373046875, + "objective/kl": 14.575862884521484, + "objective/non_score_reward": -1.4575862884521484, + "objective/rlhf_reward": -1.430345384776592, + "objective/scores": 1.1, + "policy/approxkl_avg": 134.40069580078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6239136457443237, + "step": 471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998337745666504 + }, + { + "episode": 7568, + "epoch": 0.13603192292483013, + "loss/policy_avg": 0.39453232288360596, + "lr": 2.909509202453988e-06, + "objective/entropy": 88.90142822265625, + "objective/kl": 10.912429809570312, + "objective/non_score_reward": -1.091243028640747, + "objective/rlhf_reward": 0.03502755761146581, + "objective/scores": 1.1, + "policy/approxkl_avg": 41.724300384521484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8498834371566772, + "step": 472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997682571411133 + }, + { + "episode": 7584, + "epoch": 0.1363195168422188, + "loss/policy_avg": 0.8089221119880676, + "lr": 2.909317484662577e-06, + "objective/entropy": 245.41949462890625, + "objective/kl": 19.53654670715332, + "objective/non_score_reward": -1.9536547660827637, + "objective/rlhf_reward": -6.258360086885049, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 363.8136291503906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6021559834480286, + "step": 473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999751091003418 + }, + { + "episode": 7600, + "epoch": 0.13660711075960744, + "loss/policy_avg": 0.5075941681861877, + "lr": 2.9091257668711657e-06, + "objective/entropy": -48.53450012207031, + "objective/kl": 5.752985000610352, + "objective/non_score_reward": -0.5752984285354614, + "objective/rlhf_reward": 2.098806151747704, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.8740129470825195, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.45245441794395447, + "step": 474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002760887145996 + }, + { + "episode": 7616, + "epoch": 0.13689470467699608, + "loss/policy_avg": 0.11127430945634842, + "lr": 2.908934049079755e-06, + "objective/entropy": -34.70821762084961, + "objective/kl": 13.361335754394531, + "objective/non_score_reward": -1.336133599281311, + "objective/rlhf_reward": -3.2218282840409618, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 80.18733978271484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6081266403198242, + "step": 475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994287490844727 + }, + { + "episode": 7632, + "epoch": 0.13718229859438472, + "loss/policy_avg": 0.10813181102275848, + "lr": 2.9087423312883437e-06, + "objective/entropy": 111.931640625, + "objective/kl": 6.184762954711914, + "objective/non_score_reward": -0.6184762716293335, + "objective/rlhf_reward": -0.6490763976898899, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.89632511138916, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4316413700580597, + "step": 476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000701904296875 + }, + { + "episode": 7648, + "epoch": 0.13746989251177338, + "loss/policy_avg": 0.4999096989631653, + "lr": 2.908550613496933e-06, + "objective/entropy": -403.03533935546875, + "objective/kl": 9.01245403289795, + "objective/non_score_reward": -0.9012453556060791, + "objective/rlhf_reward": -5.604981422424316, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.18182373046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5862891674041748, + "step": 477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975061416625977 + }, + { + "episode": 7664, + "epoch": 0.13775748642916202, + "loss/policy_avg": 0.14539723098278046, + "lr": 2.9083588957055213e-06, + "objective/entropy": -126.68431091308594, + "objective/kl": 18.38888168334961, + "objective/non_score_reward": -1.8388880491256714, + "objective/rlhf_reward": -9.355552673339844, + "objective/scores": -0.5, + "policy/approxkl_avg": 197.85671997070312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6251143217086792, + "step": 478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994767904281616 + }, + { + "episode": 7680, + "epoch": 0.13804508034655066, + "loss/policy_avg": 0.0330502912402153, + "lr": 2.9081671779141105e-06, + "objective/entropy": -44.419708251953125, + "objective/kl": 15.150094985961914, + "objective/non_score_reward": -1.5150094032287598, + "objective/rlhf_reward": -8.060037612915039, + "objective/scores": -0.5, + "policy/approxkl_avg": 151.68356323242188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7994130253791809, + "step": 479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995399713516235 + }, + { + "episode": 7696, + "epoch": 0.13833267426393933, + "loss/policy_avg": 0.14779314398765564, + "lr": 2.9079754601226994e-06, + "objective/entropy": 105.36286926269531, + "objective/kl": 8.258722305297852, + "objective/non_score_reward": -0.8258723616600037, + "objective/rlhf_reward": -2.9034894019365307, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.71802520751953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5357914566993713, + "step": 480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9967106580734253 + }, + { + "episode": 7712, + "epoch": 0.13862026818132797, + "loss/policy_avg": 0.27561917901039124, + "lr": 2.907783742331288e-06, + "objective/entropy": -65.93132019042969, + "objective/kl": 11.474952697753906, + "objective/non_score_reward": -1.1474955081939697, + "objective/rlhf_reward": -6.589982032775879, + "objective/scores": -0.5, + "policy/approxkl_avg": 69.8592529296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.572990894317627, + "step": 481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971063137054443 + }, + { + "episode": 7728, + "epoch": 0.1389078620987166, + "loss/policy_avg": 0.31458884477615356, + "lr": 2.9075920245398774e-06, + "objective/entropy": -12.270210266113281, + "objective/kl": 11.645172119140625, + "objective/non_score_reward": -1.1645172834396362, + "objective/rlhf_reward": -1.7343501045715537, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 109.0627670288086, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.47612980008125305, + "step": 482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962890148162842 + }, + { + "episode": 7744, + "epoch": 0.13919545601610525, + "loss/policy_avg": 0.6003249883651733, + "lr": 2.907400306748466e-06, + "objective/entropy": 110.20198059082031, + "objective/kl": 10.82795524597168, + "objective/non_score_reward": -1.0827956199645996, + "objective/rlhf_reward": -6.331182479858398, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.25099182128906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49656587839126587, + "step": 483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974627494812012 + }, + { + "episode": 7760, + "epoch": 0.13948304993349392, + "loss/policy_avg": -0.30824440717697144, + "lr": 2.907208588957055e-06, + "objective/entropy": 195.908447265625, + "objective/kl": 11.146462440490723, + "objective/non_score_reward": -1.1146461963653564, + "objective/rlhf_reward": -4.058584606647491, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.840267181396484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7134747505187988, + "step": 484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003911256790161 + }, + { + "episode": 7776, + "epoch": 0.13977064385088256, + "loss/policy_avg": 0.2714402973651886, + "lr": 2.9070168711656443e-06, + "objective/entropy": 200.92269897460938, + "objective/kl": 9.644775390625, + "objective/non_score_reward": -0.9644776582717896, + "objective/rlhf_reward": 0.5420892477035526, + "objective/scores": 1.1, + "policy/approxkl_avg": 84.43876647949219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5147348642349243, + "step": 485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978854656219482 + }, + { + "episode": 7792, + "epoch": 0.1400582377682712, + "loss/policy_avg": 0.06527984887361526, + "lr": 2.906825153374233e-06, + "objective/entropy": 108.83843994140625, + "objective/kl": 6.777806282043457, + "objective/non_score_reward": -0.6777806282043457, + "objective/rlhf_reward": 1.6888774499297146, + "objective/scores": 1.1, + "policy/approxkl_avg": 19.67444610595703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6244113445281982, + "step": 486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988651275634766 + }, + { + "episode": 7808, + "epoch": 0.14034583168565984, + "loss/policy_avg": 0.16577281057834625, + "lr": 2.9066334355828223e-06, + "objective/entropy": 89.5904541015625, + "objective/kl": 5.467947483062744, + "objective/non_score_reward": -0.5467947125434875, + "objective/rlhf_reward": -1.7871789395809172, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.6594181060791, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.589790940284729, + "step": 487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983232021331787 + }, + { + "episode": 7824, + "epoch": 0.1406334256030485, + "loss/policy_avg": 0.04526926949620247, + "lr": 2.906441717791411e-06, + "objective/entropy": -265.4620056152344, + "objective/kl": 5.726231575012207, + "objective/non_score_reward": -0.5726232528686523, + "objective/rlhf_reward": 2.109507152438164, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.101959228515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5848753452301025, + "step": 488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989309310913086 + }, + { + "episode": 7840, + "epoch": 0.14092101952043715, + "loss/policy_avg": 0.4120207726955414, + "lr": 2.90625e-06, + "objective/entropy": -195.201171875, + "objective/kl": 11.044137001037598, + "objective/non_score_reward": -1.104413628578186, + "objective/rlhf_reward": -4.017654529213905, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.49024200439453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6557003259658813, + "step": 489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995417594909668 + }, + { + "episode": 7856, + "epoch": 0.1412086134378258, + "loss/policy_avg": 0.07290078699588776, + "lr": 2.906058282208589e-06, + "objective/entropy": -118.10462951660156, + "objective/kl": 13.856963157653809, + "objective/non_score_reward": -1.3856964111328125, + "objective/rlhf_reward": -5.142785763740539, + "objective/scores": 0.1, + "policy/approxkl_avg": 48.285400390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5914438366889954, + "step": 490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999072790145874 + }, + { + "episode": 7872, + "epoch": 0.14149620735521443, + "loss/policy_avg": 0.19345812499523163, + "lr": 2.905866564417178e-06, + "objective/entropy": -72.50758361816406, + "objective/kl": 10.323736190795898, + "objective/non_score_reward": -1.0323736667633057, + "objective/rlhf_reward": -3.7294944696128365, + "objective/scores": 0.1, + "policy/approxkl_avg": 89.49455261230469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6444407105445862, + "step": 491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993422031402588 + }, + { + "episode": 7888, + "epoch": 0.1417838012726031, + "loss/policy_avg": 0.2960240840911865, + "lr": 2.905674846625767e-06, + "objective/entropy": -52.92985534667969, + "objective/kl": 10.259614944458008, + "objective/non_score_reward": -1.0259615182876587, + "objective/rlhf_reward": 0.2961540907621387, + "objective/scores": 1.1, + "policy/approxkl_avg": 101.89286804199219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6554427742958069, + "step": 492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968318939208984 + }, + { + "episode": 7904, + "epoch": 0.14207139518999173, + "loss/policy_avg": 0.18822041153907776, + "lr": 2.905483128834356e-06, + "objective/entropy": 212.44216918945312, + "objective/kl": 15.652924537658691, + "objective/non_score_reward": -1.5652923583984375, + "objective/rlhf_reward": -4.599310164869415, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 205.89048767089844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5811524987220764, + "step": 493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9962797164916992 + }, + { + "episode": 7920, + "epoch": 0.14235898910738037, + "loss/policy_avg": 0.12098196893930435, + "lr": 2.905291411042945e-06, + "objective/entropy": 153.53717041015625, + "objective/kl": 20.49001693725586, + "objective/non_score_reward": -2.049001693725586, + "objective/rlhf_reward": -6.073301257864509, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 77.01814270019531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7395473718643188, + "step": 494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977500438690186 + }, + { + "episode": 7936, + "epoch": 0.142646583024769, + "loss/policy_avg": -0.01829097419977188, + "lr": 2.905099693251534e-06, + "objective/entropy": 66.50902557373047, + "objective/kl": 10.364259719848633, + "objective/non_score_reward": -1.036426067352295, + "objective/rlhf_reward": -1.2219853147279947, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 26.25365447998047, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4803291857242584, + "step": 495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991976022720337 + }, + { + "episode": 7952, + "epoch": 0.14293417694215768, + "loss/policy_avg": 0.10506206750869751, + "lr": 2.904907975460123e-06, + "objective/entropy": 136.27267456054688, + "objective/kl": 6.317910194396973, + "objective/non_score_reward": -0.6317909955978394, + "objective/rlhf_reward": -4.527163982391357, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.371524810791016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6706969738006592, + "step": 496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975488185882568 + }, + { + "episode": 7968, + "epoch": 0.14322177085954632, + "loss/policy_avg": 0.4831598401069641, + "lr": 2.9047162576687117e-06, + "objective/entropy": -31.924362182617188, + "objective/kl": 11.602783203125, + "objective/non_score_reward": -1.1602783203125, + "objective/rlhf_reward": -4.2411130428314205, + "objective/scores": 0.1, + "policy/approxkl_avg": 87.01671600341797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7876095771789551, + "step": 497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99977707862854 + }, + { + "episode": 7984, + "epoch": 0.14350936477693496, + "loss/policy_avg": 1.053896427154541, + "lr": 2.904524539877301e-06, + "objective/entropy": 100.03763580322266, + "objective/kl": 9.37060260772705, + "objective/non_score_reward": -0.9370602369308472, + "objective/rlhf_reward": -1.6255346558251715, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.041189193725586, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6169747114181519, + "step": 498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0003209114074707 + }, + { + "episode": 8000, + "epoch": 0.14379695869432363, + "loss/policy_avg": 0.15767307579517365, + "lr": 2.9043328220858897e-06, + "objective/entropy": 32.90543746948242, + "objective/kl": 7.48246955871582, + "objective/non_score_reward": -0.7482469081878662, + "objective/rlhf_reward": -1.3311283044224842, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 31.293258666992188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7192215919494629, + "step": 499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001739978790283 + }, + { + "episode": 8016, + "epoch": 0.14408455261171227, + "loss/policy_avg": 0.3422059416770935, + "lr": 2.904141104294479e-06, + "objective/entropy": -277.40203857421875, + "objective/kl": 13.774396896362305, + "objective/non_score_reward": -1.3774398565292358, + "objective/rlhf_reward": -7.509759426116943, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.98161315917969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6245834827423096, + "step": 500, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9970264434814453 + }, + { + "episode": 8032, + "epoch": 0.1443721465291009, + "loss/policy_avg": 0.4801827669143677, + "lr": 2.9039493865030673e-06, + "objective/entropy": 83.53890228271484, + "objective/kl": 8.603704452514648, + "objective/non_score_reward": -0.8603705167770386, + "objective/rlhf_reward": -0.5177629336130348, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 94.87248992919922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7735698223114014, + "step": 501, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000962257385254 + }, + { + "episode": 8048, + "epoch": 0.14465974044648955, + "loss/policy_avg": 0.1936042606830597, + "lr": 2.9037576687116566e-06, + "objective/entropy": 16.39226531982422, + "objective/kl": 13.502885818481445, + "objective/non_score_reward": -1.3502888679504395, + "objective/rlhf_reward": -3.453743944840367, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 83.78414916992188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7048830986022949, + "step": 502, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994542598724365 + }, + { + "episode": 8064, + "epoch": 0.14494733436387822, + "loss/policy_avg": 0.1588769406080246, + "lr": 2.9035659509202454e-06, + "objective/entropy": -40.00954055786133, + "objective/kl": 18.89105987548828, + "objective/non_score_reward": -1.8891057968139648, + "objective/rlhf_reward": -4.632704351783964, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 181.8993682861328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5775229930877686, + "step": 503, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997877836227417 + }, + { + "episode": 8080, + "epoch": 0.14523492828126686, + "loss/policy_avg": 0.6248252987861633, + "lr": 2.903374233128834e-06, + "objective/entropy": -210.8176727294922, + "objective/kl": 10.124932289123535, + "objective/non_score_reward": -1.0124932527542114, + "objective/rlhf_reward": -3.649973204731941, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.988563537597656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6010805368423462, + "step": 504, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989054203033447 + }, + { + "episode": 8096, + "epoch": 0.1455225221986555, + "loss/policy_avg": 0.1622433364391327, + "lr": 2.9031825153374234e-06, + "objective/entropy": 17.784866333007812, + "objective/kl": 9.719334602355957, + "objective/non_score_reward": -0.9719333648681641, + "objective/rlhf_reward": 0.5122664213180546, + "objective/scores": 1.1, + "policy/approxkl_avg": 56.30583953857422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6292353272438049, + "step": 505, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0021533966064453 + }, + { + "episode": 8112, + "epoch": 0.14581011611604414, + "loss/policy_avg": -0.1938256323337555, + "lr": 2.9029907975460122e-06, + "objective/entropy": 127.38702392578125, + "objective/kl": 5.040970802307129, + "objective/non_score_reward": -0.5040971636772156, + "objective/rlhf_reward": -1.6163885876536368, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.422080993652344, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5696605443954468, + "step": 506, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0037827491760254 + }, + { + "episode": 8128, + "epoch": 0.1460977100334328, + "loss/policy_avg": 0.38081973791122437, + "lr": 2.902799079754601e-06, + "objective/entropy": 42.37715530395508, + "objective/kl": 7.989552021026611, + "objective/non_score_reward": -0.7989552021026611, + "objective/rlhf_reward": -1.073114486710105, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 50.31044006347656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6788329482078552, + "step": 507, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99943208694458 + }, + { + "episode": 8144, + "epoch": 0.14638530395082144, + "loss/policy_avg": 0.3784164786338806, + "lr": 2.9026073619631903e-06, + "objective/entropy": -50.67826843261719, + "objective/kl": 13.639227867126465, + "objective/non_score_reward": -1.3639228343963623, + "objective/rlhf_reward": -7.455691337585449, + "objective/scores": -0.5, + "policy/approxkl_avg": 109.05412292480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8064265847206116, + "step": 508, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993607997894287 + }, + { + "episode": 8160, + "epoch": 0.14667289786821008, + "loss/policy_avg": 0.13403823971748352, + "lr": 2.902415644171779e-06, + "objective/entropy": 96.67460632324219, + "objective/kl": 12.861349105834961, + "objective/non_score_reward": -1.2861348390579224, + "objective/rlhf_reward": -3.540419537488537, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 196.5169677734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47650325298309326, + "step": 509, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008530616760254 + }, + { + "episode": 8176, + "epoch": 0.14696049178559872, + "loss/policy_avg": 0.2548585534095764, + "lr": 2.9022239263803683e-06, + "objective/entropy": -259.3787841796875, + "objective/kl": 13.765344619750977, + "objective/non_score_reward": -1.376534342765808, + "objective/rlhf_reward": -5.106137281656265, + "objective/scores": 0.1, + "policy/approxkl_avg": 124.93861389160156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578956663608551, + "step": 510, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9970561265945435 + }, + { + "episode": 8192, + "epoch": 0.1472480857029874, + "loss/policy_avg": 0.32806700468063354, + "lr": 2.902032208588957e-06, + "objective/entropy": -89.41710662841797, + "objective/kl": 11.446915626525879, + "objective/non_score_reward": -1.1446915864944458, + "objective/rlhf_reward": -2.7539375975456943, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 100.32626342773438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6500498056411743, + "step": 511, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999818801879883 + }, + { + "episode": 8208, + "epoch": 0.14753567962037603, + "loss/policy_avg": 0.1273409128189087, + "lr": 2.901840490797546e-06, + "objective/entropy": 69.84151458740234, + "objective/kl": 18.043598175048828, + "objective/non_score_reward": -1.804359793663025, + "objective/rlhf_reward": -4.8174391150474545, + "objective/scores": 0.6, + "policy/approxkl_avg": 85.71723937988281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5488656163215637, + "step": 512, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998365879058838 + }, + { + "episode": 8224, + "epoch": 0.14782327353776467, + "loss/policy_avg": 1.1218316555023193, + "lr": 2.901648773006135e-06, + "objective/entropy": -103.88362121582031, + "objective/kl": 12.342060089111328, + "objective/non_score_reward": -1.2342060804367065, + "objective/rlhf_reward": -4.536824202537536, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.8322827816009521, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.735711932182312, + "step": 513, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000443458557129 + }, + { + "episode": 8240, + "epoch": 0.1481108674551533, + "loss/policy_avg": -0.014876842498779297, + "lr": 2.901457055214724e-06, + "objective/entropy": 75.55338287353516, + "objective/kl": 12.932259559631348, + "objective/non_score_reward": -1.2932257652282715, + "objective/rlhf_reward": -3.439569846789042, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 28.75712776184082, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.526807963848114, + "step": 514, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995429515838623 + }, + { + "episode": 8256, + "epoch": 0.14839846137254198, + "loss/policy_avg": 0.11381683498620987, + "lr": 2.901265337423313e-06, + "objective/entropy": 28.295608520507812, + "objective/kl": 12.695294380187988, + "objective/non_score_reward": -1.2695293426513672, + "objective/rlhf_reward": -4.678117549419403, + "objective/scores": 0.1, + "policy/approxkl_avg": 62.5381965637207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4214191138744354, + "step": 515, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000277519226074 + }, + { + "episode": 8272, + "epoch": 0.14868605528993062, + "loss/policy_avg": 0.031907178461551666, + "lr": 2.901073619631902e-06, + "objective/entropy": 202.26161193847656, + "objective/kl": 7.357514381408691, + "objective/non_score_reward": -0.7357515096664429, + "objective/rlhf_reward": -2.5430058300495144, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.950460433959961, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7936433553695679, + "step": 516, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013363361358643 + }, + { + "episode": 8288, + "epoch": 0.14897364920731926, + "loss/policy_avg": 0.1753254234790802, + "lr": 2.900881901840491e-06, + "objective/entropy": -103.65457153320312, + "objective/kl": 12.325798034667969, + "objective/non_score_reward": -1.2325799465179443, + "objective/rlhf_reward": -2.807613672987495, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 124.17157745361328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6873651742935181, + "step": 517, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0010011196136475 + }, + { + "episode": 8304, + "epoch": 0.1492612431247079, + "loss/policy_avg": 0.49640148878097534, + "lr": 2.90069018404908e-06, + "objective/entropy": -23.451457977294922, + "objective/kl": 11.856928825378418, + "objective/non_score_reward": -1.1856927871704102, + "objective/rlhf_reward": -2.7953601283597305, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 100.8171157836914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.530035138130188, + "step": 518, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997998595237732 + }, + { + "episode": 8320, + "epoch": 0.14954883704209657, + "loss/policy_avg": -0.026080047711730003, + "lr": 2.900498466257669e-06, + "objective/entropy": -170.7906951904297, + "objective/kl": 5.801922798156738, + "objective/non_score_reward": -0.5801923871040344, + "objective/rlhf_reward": 2.0792303770780567, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.545293807983398, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4394223093986511, + "step": 519, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988439083099365 + }, + { + "episode": 8336, + "epoch": 0.1498364309594852, + "loss/policy_avg": 0.18013785779476166, + "lr": 2.9003067484662577e-06, + "objective/entropy": -30.96051788330078, + "objective/kl": 15.340864181518555, + "objective/non_score_reward": -1.5340864658355713, + "objective/rlhf_reward": -5.736345967650413, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.957459449768066, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7148102521896362, + "step": 520, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002388954162598 + }, + { + "episode": 8352, + "epoch": 0.15012402487687385, + "loss/policy_avg": 0.21995435655117035, + "lr": 2.900115030674847e-06, + "objective/entropy": 118.80613708496094, + "objective/kl": 13.875422477722168, + "objective/non_score_reward": -1.3875422477722168, + "objective/rlhf_reward": -5.150168991088867, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.807886123657227, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6429756879806519, + "step": 521, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986985921859741 + }, + { + "episode": 8368, + "epoch": 0.1504116187942625, + "loss/policy_avg": 0.44822782278060913, + "lr": 2.8999233128834357e-06, + "objective/entropy": 131.02780151367188, + "objective/kl": 8.63813304901123, + "objective/non_score_reward": -0.8638133406639099, + "objective/rlhf_reward": -3.0552533924579617, + "objective/scores": 0.1, + "policy/approxkl_avg": 54.1556510925293, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8205841779708862, + "step": 522, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984720945358276 + }, + { + "episode": 8384, + "epoch": 0.15069921271165115, + "loss/policy_avg": 0.2645617723464966, + "lr": 2.8997315950920245e-06, + "objective/entropy": -156.31088256835938, + "objective/kl": 7.035023212432861, + "objective/non_score_reward": -0.7035022974014282, + "objective/rlhf_reward": -4.814008712768555, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.48949432373047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49438732862472534, + "step": 523, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992444515228271 + }, + { + "episode": 8400, + "epoch": 0.1509868066290398, + "loss/policy_avg": 0.09130215644836426, + "lr": 2.8995398773006133e-06, + "objective/entropy": -78.74007415771484, + "objective/kl": 8.69231128692627, + "objective/non_score_reward": -0.8692312240600586, + "objective/rlhf_reward": -0.5532056882393088, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 96.52299499511719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46065402030944824, + "step": 524, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998188853263855 + }, + { + "episode": 8416, + "epoch": 0.15127440054642843, + "loss/policy_avg": 0.13580232858657837, + "lr": 2.8993481595092026e-06, + "objective/entropy": -72.97970581054688, + "objective/kl": 7.367366313934326, + "objective/non_score_reward": -0.7367366552352905, + "objective/rlhf_reward": -4.946946620941162, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.75210189819336, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7579455375671387, + "step": 525, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000337600708008 + }, + { + "episode": 8432, + "epoch": 0.1515619944638171, + "loss/policy_avg": 0.11613625288009644, + "lr": 2.8991564417177914e-06, + "objective/entropy": 83.00586700439453, + "objective/kl": 12.637749671936035, + "objective/non_score_reward": -1.2637748718261719, + "objective/rlhf_reward": -3.2302709474888553, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 89.52025604248047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5902446508407593, + "step": 526, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989619255065918 + }, + { + "episode": 8448, + "epoch": 0.15184958838120574, + "loss/policy_avg": 0.03080570697784424, + "lr": 2.89896472392638e-06, + "objective/entropy": 106.05413055419922, + "objective/kl": 8.069011688232422, + "objective/non_score_reward": -0.8069012761116028, + "objective/rlhf_reward": -0.3038861199629035, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 105.38276672363281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5903005599975586, + "step": 527, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979126453399658 + }, + { + "episode": 8464, + "epoch": 0.15213718229859438, + "loss/policy_avg": 0.3046284317970276, + "lr": 2.8987730061349694e-06, + "objective/entropy": 152.33953857421875, + "objective/kl": 14.629440307617188, + "objective/non_score_reward": -1.4629439115524292, + "objective/rlhf_reward": -4.189916288078415, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 60.608787536621094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6111043691635132, + "step": 528, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986634254455566 + }, + { + "episode": 8480, + "epoch": 0.15242477621598302, + "loss/policy_avg": 0.3153807520866394, + "lr": 2.8985812883435582e-06, + "objective/entropy": 11.483592987060547, + "objective/kl": 7.498883247375488, + "objective/non_score_reward": -0.7498883008956909, + "objective/rlhf_reward": -2.5995532035827633, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.66455841064453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.530735969543457, + "step": 529, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997819185256958 + }, + { + "episode": 8496, + "epoch": 0.1527123701333717, + "loss/policy_avg": -0.4344179630279541, + "lr": 2.8983895705521475e-06, + "objective/entropy": 172.50282287597656, + "objective/kl": 9.609460830688477, + "objective/non_score_reward": -0.9609460234642029, + "objective/rlhf_reward": -5.843784332275391, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.63162231445312, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8059056997299194, + "step": 530, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002572536468506 + }, + { + "episode": 8512, + "epoch": 0.15299996405076033, + "loss/policy_avg": 0.09429708868265152, + "lr": 2.8981978527607363e-06, + "objective/entropy": 294.640625, + "objective/kl": 18.882694244384766, + "objective/non_score_reward": -1.888269305229187, + "objective/rlhf_reward": -7.153077340126037, + "objective/scores": 0.1, + "policy/approxkl_avg": 111.17985534667969, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7101346254348755, + "step": 531, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989078044891357 + }, + { + "episode": 8528, + "epoch": 0.15328755796814897, + "loss/policy_avg": 0.15176677703857422, + "lr": 2.898006134969325e-06, + "objective/entropy": 136.55715942382812, + "objective/kl": 12.547300338745117, + "objective/non_score_reward": -1.2547301054000854, + "objective/rlhf_reward": -0.6189204812049862, + "objective/scores": 1.1, + "policy/approxkl_avg": 103.96243286132812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8805792331695557, + "step": 532, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000082015991211 + }, + { + "episode": 8544, + "epoch": 0.1535751518855376, + "loss/policy_avg": 0.2759408950805664, + "lr": 2.8978144171779143e-06, + "objective/entropy": 157.77984619140625, + "objective/kl": 8.116591453552246, + "objective/non_score_reward": -0.8116590976715088, + "objective/rlhf_reward": -5.246636390686035, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.0709867477417, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7335650324821472, + "step": 533, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999115228652954 + }, + { + "episode": 8560, + "epoch": 0.15386274580292628, + "loss/policy_avg": 0.49007368087768555, + "lr": 2.897622699386503e-06, + "objective/entropy": -148.64157104492188, + "objective/kl": 15.987449645996094, + "objective/non_score_reward": -1.5987448692321777, + "objective/rlhf_reward": -5.994979923963546, + "objective/scores": 0.1, + "policy/approxkl_avg": 167.39520263671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6771738529205322, + "step": 534, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.995469093322754 + }, + { + "episode": 8576, + "epoch": 0.15415033972031492, + "loss/policy_avg": 0.2475529909133911, + "lr": 2.897430981595092e-06, + "objective/entropy": 63.576107025146484, + "objective/kl": 8.813644409179688, + "objective/non_score_reward": -0.8813644051551819, + "objective/rlhf_reward": -3.125457620620727, + "objective/scores": 0.1, + "policy/approxkl_avg": 47.86650848388672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7202204465866089, + "step": 535, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9946643114089966 + }, + { + "episode": 8592, + "epoch": 0.15443793363770356, + "loss/policy_avg": 0.3695685565471649, + "lr": 2.897239263803681e-06, + "objective/entropy": 72.53046417236328, + "objective/kl": 15.155887603759766, + "objective/non_score_reward": -1.5155887603759766, + "objective/rlhf_reward": -1.6623550713062283, + "objective/scores": 1.1, + "policy/approxkl_avg": 99.77005004882812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.731490969657898, + "step": 536, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989895820617676 + }, + { + "episode": 8608, + "epoch": 0.1547255275550922, + "loss/policy_avg": 0.19564469158649445, + "lr": 2.89704754601227e-06, + "objective/entropy": 104.1103515625, + "objective/kl": 12.909381866455078, + "objective/non_score_reward": -1.290938377380371, + "objective/rlhf_reward": -4.7637532413005825, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.42005920410156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4913232922554016, + "step": 537, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9961007833480835 + }, + { + "episode": 8624, + "epoch": 0.15501312147248086, + "loss/policy_avg": 0.0638544037938118, + "lr": 2.896855828220859e-06, + "objective/entropy": 120.96896362304688, + "objective/kl": 17.086078643798828, + "objective/non_score_reward": -1.708607792854309, + "objective/rlhf_reward": -6.434431171417236, + "objective/scores": 0.1, + "policy/approxkl_avg": 112.74113464355469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625002920627594, + "step": 538, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973227977752686 + }, + { + "episode": 8640, + "epoch": 0.1553007153898695, + "loss/policy_avg": 0.4011816382408142, + "lr": 2.896664110429448e-06, + "objective/entropy": 95.19891357421875, + "objective/kl": 5.403826713562012, + "objective/non_score_reward": -0.5403826236724854, + "objective/rlhf_reward": -4.161530494689941, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.818931579589844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.44589221477508545, + "step": 539, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999190330505371 + }, + { + "episode": 8656, + "epoch": 0.15558830930725814, + "loss/policy_avg": 0.006004150491207838, + "lr": 2.896472392638037e-06, + "objective/entropy": -98.64430236816406, + "objective/kl": 13.903255462646484, + "objective/non_score_reward": -1.3903255462646484, + "objective/rlhf_reward": -3.4385958335557323, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 96.46614074707031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5993703603744507, + "step": 540, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983282089233398 + }, + { + "episode": 8672, + "epoch": 0.1558759032246468, + "loss/policy_avg": 0.25113996863365173, + "lr": 2.896280674846626e-06, + "objective/entropy": 28.177112579345703, + "objective/kl": 10.27774715423584, + "objective/non_score_reward": -1.027774691581726, + "objective/rlhf_reward": -3.711098855733871, + "objective/scores": 0.1, + "policy/approxkl_avg": 15.50900650024414, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.38805845379829407, + "step": 541, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975563287734985 + }, + { + "episode": 8688, + "epoch": 0.15616349714203545, + "loss/policy_avg": 0.22983065247535706, + "lr": 2.896088957055215e-06, + "objective/entropy": -43.48884963989258, + "objective/kl": 16.519594192504883, + "objective/non_score_reward": -1.6519595384597778, + "objective/rlhf_reward": -4.207838369905948, + "objective/scores": 0.6, + "policy/approxkl_avg": 145.91189575195312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4935316741466522, + "step": 542, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9969571828842163 + }, + { + "episode": 8704, + "epoch": 0.1564510910594241, + "loss/policy_avg": 0.5034677386283875, + "lr": 2.895897239263804e-06, + "objective/entropy": -180.35511779785156, + "objective/kl": 11.565031051635742, + "objective/non_score_reward": -1.1565032005310059, + "objective/rlhf_reward": -0.22601289153099025, + "objective/scores": 1.1, + "policy/approxkl_avg": 45.29411315917969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6911122798919678, + "step": 543, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9962868690490723 + }, + { + "episode": 8720, + "epoch": 0.15673868497681273, + "loss/policy_avg": 0.02647099643945694, + "lr": 2.895705521472393e-06, + "objective/entropy": -299.92938232421875, + "objective/kl": 12.21584701538086, + "objective/non_score_reward": -1.2215845584869385, + "objective/rlhf_reward": -4.486337954550981, + "objective/scores": 0.1, + "policy/approxkl_avg": 101.26480102539062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5777586698532104, + "step": 544, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0000030994415283 + }, + { + "episode": 8736, + "epoch": 0.1570262788942014, + "loss/policy_avg": 0.0650666356086731, + "lr": 2.8955138036809817e-06, + "objective/entropy": -25.684341430664062, + "objective/kl": 12.18275260925293, + "objective/non_score_reward": -1.2182753086090088, + "objective/rlhf_reward": -0.47310084700584376, + "objective/scores": 1.1, + "policy/approxkl_avg": 73.90670776367188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5378080606460571, + "step": 545, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9970163106918335 + }, + { + "episode": 8752, + "epoch": 0.15731387281159004, + "loss/policy_avg": 0.3169589042663574, + "lr": 2.8953220858895705e-06, + "objective/entropy": 100.88613891601562, + "objective/kl": 7.750062942504883, + "objective/non_score_reward": -0.7750062942504883, + "objective/rlhf_reward": -5.100025177001953, + "objective/scores": -0.5, + "policy/approxkl_avg": 76.69898986816406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7943099737167358, + "step": 546, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006103515625 + }, + { + "episode": 8768, + "epoch": 0.15760146672897868, + "loss/policy_avg": 0.15454688668251038, + "lr": 2.8951303680981593e-06, + "objective/entropy": -160.29241943359375, + "objective/kl": 12.760615348815918, + "objective/non_score_reward": -1.2760615348815918, + "objective/rlhf_reward": -4.704246199131012, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.389159202575684, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6418617963790894, + "step": 547, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000383138656616 + }, + { + "episode": 8784, + "epoch": 0.15788906064636732, + "loss/policy_avg": -0.0003520110622048378, + "lr": 2.8949386503067486e-06, + "objective/entropy": 29.00008773803711, + "objective/kl": 12.984089851379395, + "objective/non_score_reward": -1.2984089851379395, + "objective/rlhf_reward": -2.2699168368589606, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.11572265625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.479004442691803, + "step": 548, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004520416259766 + }, + { + "episode": 8800, + "epoch": 0.158176654563756, + "loss/policy_avg": -0.15695317089557648, + "lr": 2.8947469325153374e-06, + "objective/entropy": -84.50102233886719, + "objective/kl": 18.578369140625, + "objective/non_score_reward": -1.8578369617462158, + "objective/rlhf_reward": -5.698014610509077, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 86.01622009277344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4987892508506775, + "step": 549, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977936744689941 + }, + { + "episode": 8816, + "epoch": 0.15846424848114463, + "loss/policy_avg": -0.01714038848876953, + "lr": 2.894555214723926e-06, + "objective/entropy": -128.65835571289062, + "objective/kl": 5.4775614738464355, + "objective/non_score_reward": -0.5477561950683594, + "objective/rlhf_reward": -4.1910247802734375, + "objective/scores": -0.5, + "policy/approxkl_avg": 34.354248046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5765225291252136, + "step": 550, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985828399658203 + }, + { + "episode": 8832, + "epoch": 0.15875184239853327, + "loss/policy_avg": 0.19021713733673096, + "lr": 2.8943634969325154e-06, + "objective/entropy": -61.83000946044922, + "objective/kl": 10.497882843017578, + "objective/non_score_reward": -1.0497883558273315, + "objective/rlhf_reward": -3.7991533041000363, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.45492553710938, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4981521964073181, + "step": 551, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998138666152954 + }, + { + "episode": 8848, + "epoch": 0.1590394363159219, + "loss/policy_avg": 0.06787601113319397, + "lr": 2.8941717791411042e-06, + "objective/entropy": 38.238014221191406, + "objective/kl": 15.474870681762695, + "objective/non_score_reward": -1.5474871397018433, + "objective/rlhf_reward": -8.189949035644531, + "objective/scores": -0.5, + "policy/approxkl_avg": 82.42859649658203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5804091095924377, + "step": 552, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993863105773926 + }, + { + "episode": 8864, + "epoch": 0.15932703023331057, + "loss/policy_avg": 0.30045267939567566, + "lr": 2.8939800613496935e-06, + "objective/entropy": -27.927902221679688, + "objective/kl": 10.419059753417969, + "objective/non_score_reward": -1.0419061183929443, + "objective/rlhf_reward": 0.23237573504447973, + "objective/scores": 1.1, + "policy/approxkl_avg": 209.59939575195312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6831308603286743, + "step": 553, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001933574676514 + }, + { + "episode": 8880, + "epoch": 0.1596146241506992, + "loss/policy_avg": 0.2085457146167755, + "lr": 2.8937883435582823e-06, + "objective/entropy": 35.96397018432617, + "objective/kl": 9.092164993286133, + "objective/non_score_reward": -0.9092164635658264, + "objective/rlhf_reward": -1.9035325507322947, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 39.68387222290039, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5189159512519836, + "step": 554, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000199794769287 + }, + { + "episode": 8896, + "epoch": 0.15990221806808785, + "loss/policy_avg": 0.3386860191822052, + "lr": 2.893596625766871e-06, + "objective/entropy": 12.668701171875, + "objective/kl": 14.887137413024902, + "objective/non_score_reward": -1.4887138605117798, + "objective/rlhf_reward": -5.554855218529701, + "objective/scores": 0.1, + "policy/approxkl_avg": 57.799781799316406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8298596143722534, + "step": 555, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001885414123535 + }, + { + "episode": 8912, + "epoch": 0.1601898119854765, + "loss/policy_avg": 0.2826036214828491, + "lr": 2.8934049079754603e-06, + "objective/entropy": 291.82574462890625, + "objective/kl": 11.674125671386719, + "objective/non_score_reward": -1.167412519454956, + "objective/rlhf_reward": -6.669650077819824, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.4175567626953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7528454065322876, + "step": 556, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999050498008728 + }, + { + "episode": 8928, + "epoch": 0.16047740590286516, + "loss/policy_avg": 0.10318401455879211, + "lr": 2.893213190184049e-06, + "objective/entropy": 120.07017517089844, + "objective/kl": 4.665571212768555, + "objective/non_score_reward": -0.46655717492103577, + "objective/rlhf_reward": -1.4662286698818208, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.405817031860352, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5051665306091309, + "step": 557, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997723937034607 + }, + { + "episode": 8944, + "epoch": 0.1607649998202538, + "loss/policy_avg": -0.29497459530830383, + "lr": 2.893021472392638e-06, + "objective/entropy": 37.394432067871094, + "objective/kl": 9.016752243041992, + "objective/non_score_reward": -0.9016750454902649, + "objective/rlhf_reward": -1.6592889232205705, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 60.83589172363281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6633663773536682, + "step": 558, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003632068634033 + }, + { + "episode": 8960, + "epoch": 0.16105259373764244, + "loss/policy_avg": 0.27766960859298706, + "lr": 2.892829754601227e-06, + "objective/entropy": 162.43531799316406, + "objective/kl": 14.33592414855957, + "objective/non_score_reward": -1.433592438697815, + "objective/rlhf_reward": -2.8106508597147197, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 65.74765014648438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7771644592285156, + "step": 559, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998669147491455 + }, + { + "episode": 8976, + "epoch": 0.1613401876550311, + "loss/policy_avg": 0.10618109256029129, + "lr": 2.892638036809816e-06, + "objective/entropy": -214.1174774169922, + "objective/kl": 7.891653537750244, + "objective/non_score_reward": -0.7891653776168823, + "objective/rlhf_reward": 1.2433385044336323, + "objective/scores": 1.1, + "policy/approxkl_avg": 46.754520416259766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6963553428649902, + "step": 560, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995391368865967 + }, + { + "episode": 8992, + "epoch": 0.16162778157241975, + "loss/policy_avg": 0.16953638195991516, + "lr": 2.892446319018405e-06, + "objective/entropy": -72.53345489501953, + "objective/kl": 14.390440940856934, + "objective/non_score_reward": -1.4390441179275513, + "objective/rlhf_reward": -5.356176471710205, + "objective/scores": 0.1, + "policy/approxkl_avg": 157.7322235107422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4842403531074524, + "step": 561, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982311725616455 + }, + { + "episode": 9008, + "epoch": 0.1619153754898084, + "loss/policy_avg": -0.21345758438110352, + "lr": 2.892254601226994e-06, + "objective/entropy": 70.15734100341797, + "objective/kl": 8.587663650512695, + "objective/non_score_reward": -0.8587663769721985, + "objective/rlhf_reward": -5.435065269470215, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.32007598876953, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6229357719421387, + "step": 562, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0034308433532715 + }, + { + "episode": 9024, + "epoch": 0.16220296940719703, + "loss/policy_avg": 0.628226637840271, + "lr": 2.892062883435583e-06, + "objective/entropy": 71.88837432861328, + "objective/kl": 13.11463737487793, + "objective/non_score_reward": -1.3114639520645142, + "objective/rlhf_reward": -7.245855331420898, + "objective/scores": -0.5, + "policy/approxkl_avg": 86.9578628540039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6590292453765869, + "step": 563, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999263048171997 + }, + { + "episode": 9040, + "epoch": 0.1624905633245857, + "loss/policy_avg": 0.4895268380641937, + "lr": 2.891871165644172e-06, + "objective/entropy": -137.79327392578125, + "objective/kl": 11.846264839172363, + "objective/non_score_reward": -1.1846263408660889, + "objective/rlhf_reward": -6.7385053634643555, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.822731018066406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6190036535263062, + "step": 564, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992012977600098 + }, + { + "episode": 9056, + "epoch": 0.16277815724197434, + "loss/policy_avg": 0.02343292161822319, + "lr": 2.891679447852761e-06, + "objective/entropy": -80.07128143310547, + "objective/kl": 8.20994758605957, + "objective/non_score_reward": -0.8209947943687439, + "objective/rlhf_reward": -2.8839792668819424, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.4821832180023193, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7127578258514404, + "step": 565, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.004193067550659 + }, + { + "episode": 9072, + "epoch": 0.16306575115936298, + "loss/policy_avg": 0.0619128942489624, + "lr": 2.89148773006135e-06, + "objective/entropy": -55.37126922607422, + "objective/kl": 17.618186950683594, + "objective/non_score_reward": -1.7618186473846436, + "objective/rlhf_reward": -4.12355539643881, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 126.11732482910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5896855592727661, + "step": 566, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002530097961426 + }, + { + "episode": 9088, + "epoch": 0.16335334507675162, + "loss/policy_avg": 0.33223459124565125, + "lr": 2.8912960122699385e-06, + "objective/entropy": 44.957420349121094, + "objective/kl": 12.796667098999023, + "objective/non_score_reward": -1.2796669006347656, + "objective/rlhf_reward": -7.1186676025390625, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.001811027526855, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4202909469604492, + "step": 567, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002686977386475 + }, + { + "episode": 9104, + "epoch": 0.16364093899414028, + "loss/policy_avg": 0.46875202655792236, + "lr": 2.8911042944785277e-06, + "objective/entropy": 75.29147338867188, + "objective/kl": 14.741684913635254, + "objective/non_score_reward": -1.4741685390472412, + "objective/rlhf_reward": -4.071845050129006, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 119.49514770507812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4648001790046692, + "step": 568, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998049259185791 + }, + { + "episode": 9120, + "epoch": 0.16392853291152892, + "loss/policy_avg": 0.037746116518974304, + "lr": 2.8909125766871165e-06, + "objective/entropy": -114.35173797607422, + "objective/kl": 17.751426696777344, + "objective/non_score_reward": -1.7751425504684448, + "objective/rlhf_reward": -4.176851276994917, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 63.35493850708008, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7318030595779419, + "step": 569, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995402097702026 + }, + { + "episode": 9136, + "epoch": 0.16421612682891756, + "loss/policy_avg": 0.12556898593902588, + "lr": 2.8907208588957053e-06, + "objective/entropy": 275.97406005859375, + "objective/kl": 16.65966796875, + "objective/non_score_reward": -1.6659668684005737, + "objective/rlhf_reward": -3.740148504019949, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 182.56800842285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7940037250518799, + "step": 570, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998062252998352 + }, + { + "episode": 9152, + "epoch": 0.1645037207463062, + "loss/policy_avg": 0.19311276078224182, + "lr": 2.8905291411042946e-06, + "objective/entropy": -13.802238464355469, + "objective/kl": 11.889385223388672, + "objective/non_score_reward": -1.1889386177062988, + "objective/rlhf_reward": -0.3557543516159054, + "objective/scores": 1.1, + "policy/approxkl_avg": 168.97225952148438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5607097148895264, + "step": 571, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966776371002197 + }, + { + "episode": 9168, + "epoch": 0.16479131466369487, + "loss/policy_avg": -0.0551675409078598, + "lr": 2.8903374233128834e-06, + "objective/entropy": 199.9774627685547, + "objective/kl": 10.514884948730469, + "objective/non_score_reward": -1.0514883995056152, + "objective/rlhf_reward": -3.8059536576271054, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.9141960144043, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5954399108886719, + "step": 572, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0013375282287598 + }, + { + "episode": 9184, + "epoch": 0.1650789085810835, + "loss/policy_avg": 0.5291420817375183, + "lr": 2.890145705521472e-06, + "objective/entropy": -87.00729370117188, + "objective/kl": 17.926801681518555, + "objective/non_score_reward": -1.792680263519287, + "objective/rlhf_reward": -5.048014911190544, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 174.02410888671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6229202747344971, + "step": 573, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971938133239746 + }, + { + "episode": 9200, + "epoch": 0.16536650249847215, + "loss/policy_avg": 0.026936478912830353, + "lr": 2.8899539877300614e-06, + "objective/entropy": 16.224300384521484, + "objective/kl": 12.667734146118164, + "objective/non_score_reward": -1.2667735815048218, + "objective/rlhf_reward": -0.667094303667545, + "objective/scores": 1.1, + "policy/approxkl_avg": 61.12175369262695, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6178743243217468, + "step": 574, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982073307037354 + }, + { + "episode": 9216, + "epoch": 0.1656540964158608, + "loss/policy_avg": 0.3519830107688904, + "lr": 2.8897622699386502e-06, + "objective/entropy": -118.76617431640625, + "objective/kl": 12.919654846191406, + "objective/non_score_reward": -1.2919654846191406, + "objective/rlhf_reward": -3.5060024909382927, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 151.7967529296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4337049126625061, + "step": 575, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996576189994812 + }, + { + "episode": 9232, + "epoch": 0.16594169033324946, + "loss/policy_avg": 0.10243120789527893, + "lr": 2.8895705521472395e-06, + "objective/entropy": -136.20498657226562, + "objective/kl": 8.501260757446289, + "objective/non_score_reward": -0.8501260280609131, + "objective/rlhf_reward": -1.8442450006871969, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 44.17273712158203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6579757928848267, + "step": 576, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9993293285369873 + }, + { + "episode": 9248, + "epoch": 0.1662292842506381, + "loss/policy_avg": 0.09091467410326004, + "lr": 2.8893788343558283e-06, + "objective/entropy": -303.64678955078125, + "objective/kl": 8.604415893554688, + "objective/non_score_reward": -0.8604416847229004, + "objective/rlhf_reward": -1.6169378116455784, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 31.75139045715332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7412898540496826, + "step": 577, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997444152832031 + }, + { + "episode": 9264, + "epoch": 0.16651687816802674, + "loss/policy_avg": 0.21000587940216064, + "lr": 2.889187116564417e-06, + "objective/entropy": 239.27052307128906, + "objective/kl": 10.989913940429688, + "objective/non_score_reward": -1.0989913940429688, + "objective/rlhf_reward": 0.004034423828125355, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.15020179748535, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5851229429244995, + "step": 578, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004048347473145 + }, + { + "episode": 9280, + "epoch": 0.1668044720854154, + "loss/policy_avg": 0.22401443123817444, + "lr": 2.8889953987730063e-06, + "objective/entropy": 83.70755004882812, + "objective/kl": 11.797503471374512, + "objective/non_score_reward": -1.1797504425048828, + "objective/rlhf_reward": -1.7952826365244117, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 76.76632690429688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6816673278808594, + "step": 579, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980626106262207 + }, + { + "episode": 9296, + "epoch": 0.16709206600280405, + "loss/policy_avg": -0.2864213287830353, + "lr": 2.888803680981595e-06, + "objective/entropy": -0.46259307861328125, + "objective/kl": 8.927458763122559, + "objective/non_score_reward": -0.8927459716796875, + "objective/rlhf_reward": 0.8290162324905399, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.566313743591309, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4375641345977783, + "step": 580, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000452995300293 + }, + { + "episode": 9312, + "epoch": 0.16737965992019269, + "loss/policy_avg": 0.34181928634643555, + "lr": 2.8886119631901844e-06, + "objective/entropy": -10.957565307617188, + "objective/kl": 11.041678428649902, + "objective/non_score_reward": -1.1041678190231323, + "objective/rlhf_reward": -4.016671425104141, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.29110717773438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5227146148681641, + "step": 581, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998535394668579 + }, + { + "episode": 9328, + "epoch": 0.16766725383758133, + "loss/policy_avg": 0.7440632581710815, + "lr": 2.888420245398773e-06, + "objective/entropy": -169.97946166992188, + "objective/kl": 5.892565727233887, + "objective/non_score_reward": -0.5892565846443176, + "objective/rlhf_reward": -1.9570263683795928, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.19253158569336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6428844928741455, + "step": 582, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971792697906494 + }, + { + "episode": 9344, + "epoch": 0.16795484775497, + "loss/policy_avg": 0.039995964616537094, + "lr": 2.888228527607362e-06, + "objective/entropy": 11.100467681884766, + "objective/kl": 8.740276336669922, + "objective/non_score_reward": -0.874027669429779, + "objective/rlhf_reward": -3.0961106330156323, + "objective/scores": 0.1, + "policy/approxkl_avg": 68.66104125976562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5335364937782288, + "step": 583, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975062608718872 + }, + { + "episode": 9360, + "epoch": 0.16824244167235863, + "loss/policy_avg": 0.18156462907791138, + "lr": 2.8880368098159512e-06, + "objective/entropy": 146.76071166992188, + "objective/kl": 13.685348510742188, + "objective/non_score_reward": -1.368534803390503, + "objective/rlhf_reward": -2.5504204376947612, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 198.45315551757812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6299135684967041, + "step": 584, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996903896331787 + }, + { + "episode": 9376, + "epoch": 0.16853003558974727, + "loss/policy_avg": 0.8130825757980347, + "lr": 2.88784509202454e-06, + "objective/entropy": 111.27020263671875, + "objective/kl": 10.760910034179688, + "objective/non_score_reward": -1.076090931892395, + "objective/rlhf_reward": -3.9043638467788693, + "objective/scores": 0.1, + "policy/approxkl_avg": 103.25462341308594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5371990203857422, + "step": 585, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976208209991455 + }, + { + "episode": 9392, + "epoch": 0.1688176295071359, + "loss/policy_avg": 0.39816397428512573, + "lr": 2.887653374233129e-06, + "objective/entropy": -99.50511932373047, + "objective/kl": 9.237605094909668, + "objective/non_score_reward": -0.9237604737281799, + "objective/rlhf_reward": -5.695041656494141, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.84246063232422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5522249341011047, + "step": 586, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989943504333496 + }, + { + "episode": 9408, + "epoch": 0.16910522342452458, + "loss/policy_avg": 0.16980989277362823, + "lr": 2.887461656441718e-06, + "objective/entropy": -199.70681762695312, + "objective/kl": 11.351678848266602, + "objective/non_score_reward": -1.1351678371429443, + "objective/rlhf_reward": -2.1406713783740994, + "objective/scores": 0.6, + "policy/approxkl_avg": 99.28809356689453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6828469038009644, + "step": 587, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995384216308594 + }, + { + "episode": 9424, + "epoch": 0.16939281734191322, + "loss/policy_avg": 0.4977327585220337, + "lr": 2.887269938650307e-06, + "objective/entropy": 92.04341888427734, + "objective/kl": 7.678049087524414, + "objective/non_score_reward": -0.7678048610687256, + "objective/rlhf_reward": -2.671219593286514, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.31591796875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6588965654373169, + "step": 588, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978811740875244 + }, + { + "episode": 9440, + "epoch": 0.16968041125930186, + "loss/policy_avg": 0.7905654311180115, + "lr": 2.887078220858896e-06, + "objective/entropy": 132.01498413085938, + "objective/kl": 14.539533615112305, + "objective/non_score_reward": -1.453953504562378, + "objective/rlhf_reward": -5.4158137202262875, + "objective/scores": 0.1, + "policy/approxkl_avg": 65.95307159423828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46912258863449097, + "step": 589, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991271495819092 + }, + { + "episode": 9456, + "epoch": 0.1699680051766905, + "loss/policy_avg": -0.066254623234272, + "lr": 2.8868865030674845e-06, + "objective/entropy": 71.34278869628906, + "objective/kl": 14.032512664794922, + "objective/non_score_reward": -1.4032511711120605, + "objective/rlhf_reward": -7.613004684448242, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.8614501953125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6078362464904785, + "step": 590, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000101089477539 + }, + { + "episode": 9472, + "epoch": 0.17025559909407917, + "loss/policy_avg": 0.5789448618888855, + "lr": 2.8866947852760737e-06, + "objective/entropy": 48.365203857421875, + "objective/kl": 10.217412948608398, + "objective/non_score_reward": -1.0217413902282715, + "objective/rlhf_reward": -3.6869654193520542, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.687175750732422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48520535230636597, + "step": 591, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006895065307617 + }, + { + "episode": 9488, + "epoch": 0.1705431930114678, + "loss/policy_avg": 0.4344612956047058, + "lr": 2.8865030674846625e-06, + "objective/entropy": 252.8432159423828, + "objective/kl": 17.551427841186523, + "objective/non_score_reward": -1.7551428079605103, + "objective/rlhf_reward": -9.020570755004883, + "objective/scores": -0.5, + "policy/approxkl_avg": 154.91946411132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7674171924591064, + "step": 592, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977104663848877 + }, + { + "episode": 9504, + "epoch": 0.17083078692885645, + "loss/policy_avg": 0.45362526178359985, + "lr": 2.8863113496932513e-06, + "objective/entropy": 19.733963012695312, + "objective/kl": 12.605318069458008, + "objective/non_score_reward": -1.2605319023132324, + "objective/rlhf_reward": -4.642127639055252, + "objective/scores": 0.1, + "policy/approxkl_avg": 236.24461364746094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7392984628677368, + "step": 593, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99709153175354 + }, + { + "episode": 9520, + "epoch": 0.1711183808462451, + "loss/policy_avg": 0.05706373229622841, + "lr": 2.8861196319018406e-06, + "objective/entropy": -80.88824462890625, + "objective/kl": 10.57326889038086, + "objective/non_score_reward": -1.0573269128799438, + "objective/rlhf_reward": 0.1706922888755802, + "objective/scores": 1.1, + "policy/approxkl_avg": 220.6209716796875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6385011672973633, + "step": 594, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000190258026123 + }, + { + "episode": 9536, + "epoch": 0.17140597476363376, + "loss/policy_avg": 0.4798924922943115, + "lr": 2.8859279141104294e-06, + "objective/entropy": 96.04144287109375, + "objective/kl": 13.669404029846191, + "objective/non_score_reward": -1.3669404983520508, + "objective/rlhf_reward": -1.0677618443965908, + "objective/scores": 1.1, + "policy/approxkl_avg": 79.78167724609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7598651647567749, + "step": 595, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998369216918945 + }, + { + "episode": 9552, + "epoch": 0.1716935686810224, + "loss/policy_avg": 0.6355167627334595, + "lr": 2.8857361963190186e-06, + "objective/entropy": 64.89462280273438, + "objective/kl": 13.15363883972168, + "objective/non_score_reward": -1.315363883972168, + "objective/rlhf_reward": -4.861455595493316, + "objective/scores": 0.1, + "policy/approxkl_avg": 57.076480865478516, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47427719831466675, + "step": 596, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995226860046387 + }, + { + "episode": 9568, + "epoch": 0.17198116259841104, + "loss/policy_avg": 0.10829915851354599, + "lr": 2.8855444785276074e-06, + "objective/entropy": 79.2706298828125, + "objective/kl": 16.379791259765625, + "objective/non_score_reward": -1.63797926902771, + "objective/rlhf_reward": -6.151916986703872, + "objective/scores": 0.1, + "policy/approxkl_avg": 103.64852142333984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6097987294197083, + "step": 597, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987919330596924 + }, + { + "episode": 9584, + "epoch": 0.1722687565157997, + "loss/policy_avg": 0.13417291641235352, + "lr": 2.8853527607361962e-06, + "objective/entropy": -54.99534606933594, + "objective/kl": 13.775299072265625, + "objective/non_score_reward": -1.3775299787521362, + "objective/rlhf_reward": -3.110119885206222, + "objective/scores": 0.6, + "policy/approxkl_avg": 41.135406494140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6685090661048889, + "step": 598, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994128942489624 + }, + { + "episode": 9600, + "epoch": 0.17255635043318834, + "loss/policy_avg": 0.2789982557296753, + "lr": 2.8851610429447855e-06, + "objective/entropy": 6.352031707763672, + "objective/kl": 15.038141250610352, + "objective/non_score_reward": -1.5038139820098877, + "objective/rlhf_reward": -3.61525604724884, + "objective/scores": 0.6, + "policy/approxkl_avg": 80.10298919677734, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47064271569252014, + "step": 599, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999215602874756 + }, + { + "episode": 9616, + "epoch": 0.17284394435057698, + "loss/policy_avg": 0.17868322134017944, + "lr": 2.8849693251533743e-06, + "objective/entropy": 100.41844940185547, + "objective/kl": 14.95844841003418, + "objective/non_score_reward": -1.495845079421997, + "objective/rlhf_reward": -3.8606740257897716, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 51.73733139038086, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8421609997749329, + "step": 600, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001194953918457 + }, + { + "episode": 9632, + "epoch": 0.17313153826796562, + "loss/policy_avg": 0.6214461326599121, + "lr": 2.884777607361963e-06, + "objective/entropy": 231.32444763183594, + "objective/kl": 14.899240493774414, + "objective/non_score_reward": -1.4899241924285889, + "objective/rlhf_reward": -3.035977606416914, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 157.85556030273438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5734395980834961, + "step": 601, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9957280158996582 + }, + { + "episode": 9648, + "epoch": 0.1734191321853543, + "loss/policy_avg": 0.16177789866924286, + "lr": 2.8845858895705523e-06, + "objective/entropy": -75.67822265625, + "objective/kl": 13.02596664428711, + "objective/non_score_reward": -1.3025965690612793, + "objective/rlhf_reward": -0.8103865742683407, + "objective/scores": 1.1, + "policy/approxkl_avg": 51.60423278808594, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7318694591522217, + "step": 602, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996418952941895 + }, + { + "episode": 9664, + "epoch": 0.17370672610274293, + "loss/policy_avg": 0.22588306665420532, + "lr": 2.884394171779141e-06, + "objective/entropy": 132.67103576660156, + "objective/kl": 13.433195114135742, + "objective/non_score_reward": -1.3433196544647217, + "objective/rlhf_reward": -4.973278379440307, + "objective/scores": 0.1, + "policy/approxkl_avg": 99.59487915039062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6894341707229614, + "step": 603, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990489482879639 + }, + { + "episode": 9680, + "epoch": 0.17399432002013157, + "loss/policy_avg": 0.22847160696983337, + "lr": 2.8842024539877304e-06, + "objective/entropy": 122.22633361816406, + "objective/kl": 20.479631423950195, + "objective/non_score_reward": -2.0479629039764404, + "objective/rlhf_reward": -3.7918519139289852, + "objective/scores": 1.1, + "policy/approxkl_avg": 103.07848358154297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8397064805030823, + "step": 604, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971545934677124 + }, + { + "episode": 9696, + "epoch": 0.1742819139375202, + "loss/policy_avg": 0.397056519985199, + "lr": 2.884010736196319e-06, + "objective/entropy": -62.73558044433594, + "objective/kl": 10.932550430297852, + "objective/non_score_reward": -1.0932550430297852, + "objective/rlhf_reward": -3.9730204105377194, + "objective/scores": 0.1, + "policy/approxkl_avg": 28.36017417907715, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5593264102935791, + "step": 605, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984571933746338 + }, + { + "episode": 9712, + "epoch": 0.17456950785490888, + "loss/policy_avg": 0.28502804040908813, + "lr": 2.883819018404908e-06, + "objective/entropy": 72.55328369140625, + "objective/kl": 12.644740104675293, + "objective/non_score_reward": -1.2644741535186768, + "objective/rlhf_reward": -7.057896614074707, + "objective/scores": -0.5, + "policy/approxkl_avg": 103.56513214111328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49720489978790283, + "step": 606, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984281063079834 + }, + { + "episode": 9728, + "epoch": 0.17485710177229752, + "loss/policy_avg": 0.49002373218536377, + "lr": 2.8836273006134972e-06, + "objective/entropy": -66.95445251464844, + "objective/kl": 12.276761054992676, + "objective/non_score_reward": -1.2276760339736938, + "objective/rlhf_reward": -6.910704135894775, + "objective/scores": -0.5, + "policy/approxkl_avg": 128.71206665039062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5521280765533447, + "step": 607, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998512625694275 + }, + { + "episode": 9744, + "epoch": 0.17514469568968616, + "loss/policy_avg": 0.15580351650714874, + "lr": 2.883435582822086e-06, + "objective/entropy": -14.935211181640625, + "objective/kl": 12.201488494873047, + "objective/non_score_reward": -1.2201488018035889, + "objective/rlhf_reward": -4.480595326423645, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.03843688964844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6915831565856934, + "step": 608, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9991041421890259 + }, + { + "episode": 9760, + "epoch": 0.1754322896070748, + "loss/policy_avg": -0.07565896213054657, + "lr": 2.883243865030675e-06, + "objective/entropy": -222.00851440429688, + "objective/kl": 13.098295211791992, + "objective/non_score_reward": -1.3098294734954834, + "objective/rlhf_reward": -3.4144895031777134, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 23.588008880615234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4547193646430969, + "step": 609, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002942085266113 + }, + { + "episode": 9776, + "epoch": 0.17571988352446347, + "loss/policy_avg": -0.02608119696378708, + "lr": 2.883052147239264e-06, + "objective/entropy": -214.93325805664062, + "objective/kl": 13.121513366699219, + "objective/non_score_reward": -1.3121511936187744, + "objective/rlhf_reward": -2.324885469616625, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.12300109863281, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7193573713302612, + "step": 610, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001455307006836 + }, + { + "episode": 9792, + "epoch": 0.1760074774418521, + "loss/policy_avg": 0.05792073905467987, + "lr": 2.882860429447853e-06, + "objective/entropy": -38.28462600708008, + "objective/kl": 6.414361000061035, + "objective/non_score_reward": -0.6414362192153931, + "objective/rlhf_reward": -4.565744876861572, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.29863929748535, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7934014797210693, + "step": 611, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998834133148193 + }, + { + "episode": 9808, + "epoch": 0.17629507135924075, + "loss/policy_avg": 0.34763196110725403, + "lr": 2.8826687116564417e-06, + "objective/entropy": -187.356201171875, + "objective/kl": 14.694540977478027, + "objective/non_score_reward": -1.4694541692733765, + "objective/rlhf_reward": -7.877816200256348, + "objective/scores": -0.5, + "policy/approxkl_avg": 164.05078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48596644401550293, + "step": 612, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99922776222229 + }, + { + "episode": 9824, + "epoch": 0.17658266527662939, + "loss/policy_avg": 0.034425608813762665, + "lr": 2.8824769938650305e-06, + "objective/entropy": 30.521377563476562, + "objective/kl": 14.546671867370605, + "objective/non_score_reward": -1.4546672105789185, + "objective/rlhf_reward": -5.4186688423156735, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.84369659423828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6131436824798584, + "step": 613, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000127077102661 + }, + { + "episode": 9840, + "epoch": 0.17687025919401805, + "loss/policy_avg": 0.6788812875747681, + "lr": 2.8822852760736197e-06, + "objective/entropy": -29.50526237487793, + "objective/kl": 11.87314224243164, + "objective/non_score_reward": -1.1873142719268799, + "objective/rlhf_reward": -1.8255377455961435, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 69.99320220947266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484935283660889, + "step": 614, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9981306791305542 + }, + { + "episode": 9856, + "epoch": 0.1771578531114067, + "loss/policy_avg": 0.018577001988887787, + "lr": 2.8820935582822085e-06, + "objective/entropy": 91.57209777832031, + "objective/kl": 15.141888618469238, + "objective/non_score_reward": -1.5141887664794922, + "objective/rlhf_reward": -8.056755065917969, + "objective/scores": -0.5, + "policy/approxkl_avg": 128.7979736328125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5703952312469482, + "step": 615, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988903999328613 + }, + { + "episode": 9872, + "epoch": 0.17744544702879533, + "loss/policy_avg": -0.041117969900369644, + "lr": 2.8819018404907974e-06, + "objective/entropy": 137.1066436767578, + "objective/kl": 6.767946243286133, + "objective/non_score_reward": -0.6767945885658264, + "objective/rlhf_reward": -2.30717841386795, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.107406616210938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6602547764778137, + "step": 616, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993090629577637 + }, + { + "episode": 9888, + "epoch": 0.177733040946184, + "loss/policy_avg": 0.3238484859466553, + "lr": 2.8817101226993866e-06, + "objective/entropy": -231.63035583496094, + "objective/kl": 9.579354286193848, + "objective/non_score_reward": -0.9579353928565979, + "objective/rlhf_reward": -5.8317413330078125, + "objective/scores": -0.5, + "policy/approxkl_avg": 95.74900817871094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.772737979888916, + "step": 617, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9988899230957031 + }, + { + "episode": 9904, + "epoch": 0.17802063486357264, + "loss/policy_avg": -0.08733166754245758, + "lr": 2.8815184049079754e-06, + "objective/entropy": 94.04405975341797, + "objective/kl": 10.622529983520508, + "objective/non_score_reward": -1.0622529983520508, + "objective/rlhf_reward": -1.849012157320976, + "objective/scores": 0.6, + "policy/approxkl_avg": 161.30787658691406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6008134484291077, + "step": 618, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983432292938232 + }, + { + "episode": 9920, + "epoch": 0.17830822878096128, + "loss/policy_avg": 0.22342449426651, + "lr": 2.8813266871165646e-06, + "objective/entropy": 23.68462562561035, + "objective/kl": 11.187488555908203, + "objective/non_score_reward": -1.1187489032745361, + "objective/rlhf_reward": -6.4749956130981445, + "objective/scores": -0.5, + "policy/approxkl_avg": 71.44534301757812, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.44539380073547363, + "step": 619, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987552165985107 + }, + { + "episode": 9936, + "epoch": 0.17859582269834992, + "loss/policy_avg": 0.22347092628479004, + "lr": 2.8811349693251534e-06, + "objective/entropy": 175.77520751953125, + "objective/kl": 17.442081451416016, + "objective/non_score_reward": -1.7442083358764648, + "objective/rlhf_reward": -8.97683334350586, + "objective/scores": -0.5, + "policy/approxkl_avg": 161.70484924316406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4676833748817444, + "step": 620, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993557929992676 + }, + { + "episode": 9952, + "epoch": 0.1788834166157386, + "loss/policy_avg": 0.09556691348552704, + "lr": 2.8809432515337422e-06, + "objective/entropy": 166.47305297851562, + "objective/kl": 14.810133934020996, + "objective/non_score_reward": -1.4810134172439575, + "objective/rlhf_reward": -5.524053907394409, + "objective/scores": 0.1, + "policy/approxkl_avg": 101.40066528320312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5872761011123657, + "step": 621, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000371217727661 + }, + { + "episode": 9968, + "epoch": 0.17917101053312723, + "loss/policy_avg": 0.30460673570632935, + "lr": 2.8807515337423315e-06, + "objective/entropy": -225.1236572265625, + "objective/kl": 13.543222427368164, + "objective/non_score_reward": -1.3543224334716797, + "objective/rlhf_reward": -1.0172893762588497, + "objective/scores": 1.1, + "policy/approxkl_avg": 82.92485046386719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5670063495635986, + "step": 622, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980478286743164 + }, + { + "episode": 9984, + "epoch": 0.17945860445051587, + "loss/policy_avg": 0.2849801778793335, + "lr": 2.8805598159509203e-06, + "objective/entropy": 96.21595764160156, + "objective/kl": 12.93740463256836, + "objective/non_score_reward": -1.2937402725219727, + "objective/rlhf_reward": -4.774961328506469, + "objective/scores": 0.1, + "policy/approxkl_avg": 142.59600830078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47951966524124146, + "step": 623, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973121881484985 + }, + { + "episode": 10000, + "epoch": 0.1797461983679045, + "loss/policy_avg": 0.25585901737213135, + "lr": 2.880368098159509e-06, + "objective/entropy": 194.88865661621094, + "objective/kl": 6.153909206390381, + "objective/non_score_reward": -0.6153908967971802, + "objective/rlhf_reward": 0.4621554904270444, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 31.100088119506836, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7605673670768738, + "step": 624, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002366065979004 + }, + { + "episode": 10016, + "epoch": 0.18003379228529318, + "loss/policy_avg": 0.24915774166584015, + "lr": 2.8801763803680983e-06, + "objective/entropy": 120.67022705078125, + "objective/kl": 8.496511459350586, + "objective/non_score_reward": -0.8496510982513428, + "objective/rlhf_reward": -5.398604393005371, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.57280731201172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6938654184341431, + "step": 625, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988131523132324 + }, + { + "episode": 10032, + "epoch": 0.18032138620268182, + "loss/policy_avg": 0.4236345887184143, + "lr": 2.879984662576687e-06, + "objective/entropy": -12.667598724365234, + "objective/kl": 11.722315788269043, + "objective/non_score_reward": -1.1722315549850464, + "objective/rlhf_reward": -4.2889264062047, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.98892211914062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6948565244674683, + "step": 626, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976496696472168 + }, + { + "episode": 10048, + "epoch": 0.18060898012007046, + "loss/policy_avg": 0.16770553588867188, + "lr": 2.8797929447852764e-06, + "objective/entropy": 57.50523376464844, + "objective/kl": 17.840484619140625, + "objective/non_score_reward": -1.7840485572814941, + "objective/rlhf_reward": -9.136194229125977, + "objective/scores": -0.5, + "policy/approxkl_avg": 120.76194763183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6823735237121582, + "step": 627, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997957944869995 + }, + { + "episode": 10064, + "epoch": 0.1808965740374591, + "loss/policy_avg": 0.4961099624633789, + "lr": 2.879601226993865e-06, + "objective/entropy": -25.265766143798828, + "objective/kl": 10.27054214477539, + "objective/non_score_reward": -1.0270541906356812, + "objective/rlhf_reward": -1.1844979419719903, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 57.600494384765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6599663496017456, + "step": 628, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999619722366333 + }, + { + "episode": 10080, + "epoch": 0.18118416795484776, + "loss/policy_avg": 0.562252938747406, + "lr": 2.879409509202454e-06, + "objective/entropy": 54.366180419921875, + "objective/kl": 13.384576797485352, + "objective/non_score_reward": -1.3384575843811035, + "objective/rlhf_reward": -0.9538304716348645, + "objective/scores": 1.1, + "policy/approxkl_avg": 58.41983413696289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6028238534927368, + "step": 629, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995472431182861 + }, + { + "episode": 10096, + "epoch": 0.1814717618722364, + "loss/policy_avg": 0.9494496583938599, + "lr": 2.8792177914110432e-06, + "objective/entropy": 102.92730712890625, + "objective/kl": 11.970917701721191, + "objective/non_score_reward": -1.1970919370651245, + "objective/rlhf_reward": -3.055034414927164, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 71.0523910522461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5434826612472534, + "step": 630, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000587224960327 + }, + { + "episode": 10112, + "epoch": 0.18175935578962504, + "loss/policy_avg": 0.06726402044296265, + "lr": 2.879026073619632e-06, + "objective/entropy": -49.708221435546875, + "objective/kl": 11.118875503540039, + "objective/non_score_reward": -1.1118874549865723, + "objective/rlhf_reward": -1.5238312228929727, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 84.36143493652344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7672351002693176, + "step": 631, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989380836486816 + }, + { + "episode": 10128, + "epoch": 0.18204694970701368, + "loss/policy_avg": -0.2285957634449005, + "lr": 2.8788343558282213e-06, + "objective/entropy": 167.11105346679688, + "objective/kl": 11.83863353729248, + "objective/non_score_reward": -1.1838631629943848, + "objective/rlhf_reward": -3.1791934733658582, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 22.718273162841797, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.45381465554237366, + "step": 632, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0061585903167725 + }, + { + "episode": 10144, + "epoch": 0.18233454362440235, + "loss/policy_avg": 1.1582974195480347, + "lr": 2.87864263803681e-06, + "objective/entropy": -151.89218139648438, + "objective/kl": 7.8154096603393555, + "objective/non_score_reward": -0.7815409898757935, + "objective/rlhf_reward": -2.7261639595031735, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.87982177734375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6609333753585815, + "step": 633, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000443458557129 + }, + { + "episode": 10160, + "epoch": 0.182622137541791, + "loss/policy_avg": -0.023708324879407883, + "lr": 2.878450920245399e-06, + "objective/entropy": 37.85424041748047, + "objective/kl": 12.195389747619629, + "objective/non_score_reward": -1.2195390462875366, + "objective/rlhf_reward": -1.9544372900736064, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 84.37319946289062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.563186764717102, + "step": 634, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0016093254089355 + }, + { + "episode": 10176, + "epoch": 0.18290973145917963, + "loss/policy_avg": 0.22720938920974731, + "lr": 2.8782592024539877e-06, + "objective/entropy": 70.03634643554688, + "objective/kl": 14.060079574584961, + "objective/non_score_reward": -1.406008005142212, + "objective/rlhf_reward": -7.624032020568848, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.89201354980469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.449720174074173, + "step": 635, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980741739273071 + }, + { + "episode": 10192, + "epoch": 0.1831973253765683, + "loss/policy_avg": 0.5389465689659119, + "lr": 2.8780674846625765e-06, + "objective/entropy": -199.75421142578125, + "objective/kl": 15.233451843261719, + "objective/non_score_reward": -1.5233452320098877, + "objective/rlhf_reward": -8.093381881713867, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.79878234863281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.48563235998153687, + "step": 636, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998356819152832 + }, + { + "episode": 10208, + "epoch": 0.18348491929395694, + "loss/policy_avg": 0.37082988023757935, + "lr": 2.8778757668711657e-06, + "objective/entropy": 29.62700653076172, + "objective/kl": 10.318947792053223, + "objective/non_score_reward": -1.0318948030471802, + "objective/rlhf_reward": -1.2038601979028907, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 19.83915138244629, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5535821914672852, + "step": 637, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009536743164062 + }, + { + "episode": 10224, + "epoch": 0.18377251321134558, + "loss/policy_avg": 0.01469859853386879, + "lr": 2.8776840490797546e-06, + "objective/entropy": 13.477630615234375, + "objective/kl": 14.343404769897461, + "objective/non_score_reward": -1.4343405961990356, + "objective/rlhf_reward": -5.337362265586853, + "objective/scores": 0.1, + "policy/approxkl_avg": 108.1371078491211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4337630867958069, + "step": 638, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978924989700317 + }, + { + "episode": 10240, + "epoch": 0.18406010712873422, + "loss/policy_avg": 0.3221081495285034, + "lr": 2.8774923312883434e-06, + "objective/entropy": 282.66595458984375, + "objective/kl": 11.63412857055664, + "objective/non_score_reward": -1.1634126901626587, + "objective/rlhf_reward": -0.2536508500576016, + "objective/scores": 1.1, + "policy/approxkl_avg": 111.42243957519531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7095741033554077, + "step": 639, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970577955245972 + }, + { + "episode": 10256, + "epoch": 0.18434770104612289, + "loss/policy_avg": 0.08750347793102264, + "lr": 2.8773006134969326e-06, + "objective/entropy": -36.40140914916992, + "objective/kl": 12.267041206359863, + "objective/non_score_reward": -1.2267042398452759, + "objective/rlhf_reward": -6.9068169593811035, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.82714080810547, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5733236074447632, + "step": 640, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000546932220459 + }, + { + "episode": 10272, + "epoch": 0.18463529496351153, + "loss/policy_avg": 0.3036690652370453, + "lr": 2.8771088957055214e-06, + "objective/entropy": 49.865753173828125, + "objective/kl": 11.792703628540039, + "objective/non_score_reward": -1.1792702674865723, + "objective/rlhf_reward": -4.317080801725387, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.753089904785156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.695482611656189, + "step": 641, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989582300186157 + }, + { + "episode": 10288, + "epoch": 0.18492288888090017, + "loss/policy_avg": 0.7012618780136108, + "lr": 2.8769171779141106e-06, + "objective/entropy": 92.8203125, + "objective/kl": 10.009756088256836, + "objective/non_score_reward": -1.0009756088256836, + "objective/rlhf_reward": -6.003902435302734, + "objective/scores": -0.5, + "policy/approxkl_avg": 160.71142578125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.45697641372680664, + "step": 642, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986211061477661 + }, + { + "episode": 10304, + "epoch": 0.1852104827982888, + "loss/policy_avg": 0.18139493465423584, + "lr": 2.8767254601226994e-06, + "objective/entropy": 35.44036865234375, + "objective/kl": 11.35489273071289, + "objective/non_score_reward": -1.1354892253875732, + "objective/rlhf_reward": -6.541956901550293, + "objective/scores": -0.5, + "policy/approxkl_avg": 73.70870971679688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6653156876564026, + "step": 643, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983882904052734 + }, + { + "episode": 10320, + "epoch": 0.18549807671567747, + "loss/policy_avg": 0.21259146928787231, + "lr": 2.8765337423312883e-06, + "objective/entropy": 65.79669189453125, + "objective/kl": 5.1166887283325195, + "objective/non_score_reward": -0.5116689205169678, + "objective/rlhf_reward": -1.6466757565736772, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.101611137390137, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44910210371017456, + "step": 644, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9965652227401733 + }, + { + "episode": 10336, + "epoch": 0.1857856706330661, + "loss/policy_avg": 1.021752953529358, + "lr": 2.8763420245398775e-06, + "objective/entropy": -34.64234161376953, + "objective/kl": 15.709405899047852, + "objective/non_score_reward": -1.5709404945373535, + "objective/rlhf_reward": -5.883762127161026, + "objective/scores": 0.1, + "policy/approxkl_avg": 142.4956817626953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5723711252212524, + "step": 645, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976788759231567 + }, + { + "episode": 10352, + "epoch": 0.18607326455045475, + "loss/policy_avg": 0.24777646362781525, + "lr": 2.8761503067484663e-06, + "objective/entropy": 77.03242492675781, + "objective/kl": 15.705501556396484, + "objective/non_score_reward": -1.5705503225326538, + "objective/rlhf_reward": -5.882201319932937, + "objective/scores": 0.1, + "policy/approxkl_avg": 104.79753875732422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619193434715271, + "step": 646, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973654747009277 + }, + { + "episode": 10368, + "epoch": 0.1863608584678434, + "loss/policy_avg": 0.05733855068683624, + "lr": 2.8759585889570555e-06, + "objective/entropy": 105.6750717163086, + "objective/kl": 13.765717506408691, + "objective/non_score_reward": -1.3765718936920166, + "objective/rlhf_reward": -5.1062874555587765, + "objective/scores": 0.1, + "policy/approxkl_avg": 96.42398834228516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.2780163288116455, + "step": 647, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983540773391724 + }, + { + "episode": 10384, + "epoch": 0.18664845238523206, + "loss/policy_avg": 0.4622950553894043, + "lr": 2.8757668711656443e-06, + "objective/entropy": -98.47879028320312, + "objective/kl": 7.556210517883301, + "objective/non_score_reward": -0.755621075630188, + "objective/rlhf_reward": -0.09876528823492192, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 68.89460754394531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7213411927223206, + "step": 648, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9966726303100586 + }, + { + "episode": 10400, + "epoch": 0.1869360463026207, + "loss/policy_avg": 0.8236960172653198, + "lr": 2.875575153374233e-06, + "objective/entropy": 18.49713897705078, + "objective/kl": 15.854425430297852, + "objective/non_score_reward": -1.5854425430297852, + "objective/rlhf_reward": -8.34177017211914, + "objective/scores": -0.5, + "policy/approxkl_avg": 67.23236083984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8861958980560303, + "step": 649, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006723403930664 + }, + { + "episode": 10416, + "epoch": 0.18722364022000934, + "loss/policy_avg": -0.11324408650398254, + "lr": 2.8753834355828224e-06, + "objective/entropy": -102.02124786376953, + "objective/kl": 5.1298675537109375, + "objective/non_score_reward": -0.5129867792129517, + "objective/rlhf_reward": 0.8717719123351846, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 28.45832633972168, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4376002848148346, + "step": 650, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0088820457458496 + }, + { + "episode": 10432, + "epoch": 0.18751123413739798, + "loss/policy_avg": 0.2450694590806961, + "lr": 2.875191717791411e-06, + "objective/entropy": 33.388450622558594, + "objective/kl": 13.52882194519043, + "objective/non_score_reward": -1.3528821468353271, + "objective/rlhf_reward": -2.4878097816717357, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 11.315901756286621, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.41914981603622437, + "step": 651, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0015416145324707 + }, + { + "episode": 10448, + "epoch": 0.18779882805478665, + "loss/policy_avg": 0.009298861026763916, + "lr": 2.875e-06, + "objective/entropy": -16.950359344482422, + "objective/kl": 9.792383193969727, + "objective/non_score_reward": -0.9792382717132568, + "objective/rlhf_reward": 0.4830468982458118, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.477102279663086, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6265867948532104, + "step": 652, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000293016433716 + }, + { + "episode": 10464, + "epoch": 0.1880864219721753, + "loss/policy_avg": 0.4039332866668701, + "lr": 2.8748082822085892e-06, + "objective/entropy": 66.02500915527344, + "objective/kl": 8.322999954223633, + "objective/non_score_reward": -0.8322999477386475, + "objective/rlhf_reward": -5.32919979095459, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.993392944335938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6458040475845337, + "step": 653, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998199939727783 + }, + { + "episode": 10480, + "epoch": 0.18837401588956393, + "loss/policy_avg": 0.15707165002822876, + "lr": 2.874616564417178e-06, + "objective/entropy": 171.90771484375, + "objective/kl": 11.948579788208008, + "objective/non_score_reward": -1.1948580741882324, + "objective/rlhf_reward": -6.77943229675293, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.851991653442383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6504145860671997, + "step": 654, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997503757476807 + }, + { + "episode": 10496, + "epoch": 0.1886616098069526, + "loss/policy_avg": 0.07920925319194794, + "lr": 2.8744248466257673e-06, + "objective/entropy": 61.45829391479492, + "objective/kl": 12.758872985839844, + "objective/non_score_reward": -1.275887370109558, + "objective/rlhf_reward": -3.1561381918954208, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 101.437744140625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8374680876731873, + "step": 655, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0019378662109375 + }, + { + "episode": 10512, + "epoch": 0.18894920372434124, + "loss/policy_avg": 0.555253267288208, + "lr": 2.8742331288343557e-06, + "objective/entropy": -175.50808715820312, + "objective/kl": 14.069064140319824, + "objective/non_score_reward": -1.4069066047668457, + "objective/rlhf_reward": -5.227626121044159, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.80171203613281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.590570330619812, + "step": 656, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998897910118103 + }, + { + "episode": 10528, + "epoch": 0.18923679764172988, + "loss/policy_avg": 0.11733473837375641, + "lr": 2.874041411042945e-06, + "objective/entropy": 114.00068664550781, + "objective/kl": 18.550704956054688, + "objective/non_score_reward": -1.8550705909729004, + "objective/rlhf_reward": -7.020282661914825, + "objective/scores": 0.1, + "policy/approxkl_avg": 203.83114624023438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6477522850036621, + "step": 657, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994854927062988 + }, + { + "episode": 10544, + "epoch": 0.18952439155911852, + "loss/policy_avg": 0.15716442465782166, + "lr": 2.8738496932515337e-06, + "objective/entropy": 280.2091064453125, + "objective/kl": 15.730361938476562, + "objective/non_score_reward": -1.5730363130569458, + "objective/rlhf_reward": -8.292144775390625, + "objective/scores": -0.5, + "policy/approxkl_avg": 69.69169616699219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 1.0129367113113403, + "step": 658, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002072334289551 + }, + { + "episode": 10560, + "epoch": 0.18981198547650718, + "loss/policy_avg": 0.6117931604385376, + "lr": 2.8736579754601225e-06, + "objective/entropy": -17.964401245117188, + "objective/kl": 12.338603973388672, + "objective/non_score_reward": -1.2338604927062988, + "objective/rlhf_reward": -6.935441970825195, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.43081283569336, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6273235082626343, + "step": 659, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0019495487213135 + }, + { + "episode": 10576, + "epoch": 0.19009957939389582, + "loss/policy_avg": 0.1620130091905594, + "lr": 2.8734662576687117e-06, + "objective/entropy": -31.008544921875, + "objective/kl": 8.896329879760742, + "objective/non_score_reward": -0.8896329998970032, + "objective/rlhf_reward": -1.6111206514405567, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.579498291015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8015791177749634, + "step": 660, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0010838508605957 + }, + { + "episode": 10592, + "epoch": 0.19038717331128446, + "loss/policy_avg": 0.30528926849365234, + "lr": 2.8732745398773006e-06, + "objective/entropy": -81.17588806152344, + "objective/kl": 12.422649383544922, + "objective/non_score_reward": -1.2422648668289185, + "objective/rlhf_reward": -2.569059437513351, + "objective/scores": 0.6, + "policy/approxkl_avg": 26.168109893798828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6268837451934814, + "step": 661, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000943899154663 + }, + { + "episode": 10608, + "epoch": 0.1906747672286731, + "loss/policy_avg": 0.21409085392951965, + "lr": 2.8730828220858894e-06, + "objective/entropy": 109.389404296875, + "objective/kl": 14.156780242919922, + "objective/non_score_reward": -1.415677785873413, + "objective/rlhf_reward": -1.2627113521099087, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.9066390991211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6278871297836304, + "step": 662, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998223781585693 + }, + { + "episode": 10624, + "epoch": 0.19096236114606177, + "loss/policy_avg": 0.4462759494781494, + "lr": 2.8728911042944786e-06, + "objective/entropy": 9.342247009277344, + "objective/kl": 15.820850372314453, + "objective/non_score_reward": -1.5820850133895874, + "objective/rlhf_reward": -4.205633642450843, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 166.1714630126953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4893154501914978, + "step": 663, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985361099243164 + }, + { + "episode": 10640, + "epoch": 0.1912499550634504, + "loss/policy_avg": 0.26638445258140564, + "lr": 2.8726993865030674e-06, + "objective/entropy": 177.13156127929688, + "objective/kl": 9.343158721923828, + "objective/non_score_reward": -0.9343159794807434, + "objective/rlhf_reward": -5.7372636795043945, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.281099319458008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6856767535209656, + "step": 664, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989508390426636 + }, + { + "episode": 10656, + "epoch": 0.19153754898083905, + "loss/policy_avg": 0.16023144125938416, + "lr": 2.8725076687116566e-06, + "objective/entropy": 78.05880737304688, + "objective/kl": 7.6621246337890625, + "objective/non_score_reward": -0.7662124633789062, + "objective/rlhf_reward": -2.664849868416786, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.030677795410156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3968629837036133, + "step": 665, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0032896995544434 + }, + { + "episode": 10672, + "epoch": 0.1918251428982277, + "loss/policy_avg": 0.06800419837236404, + "lr": 2.8723159509202455e-06, + "objective/entropy": 59.63671875, + "objective/kl": 12.568862915039062, + "objective/non_score_reward": -1.25688636302948, + "objective/rlhf_reward": -7.02754545211792, + "objective/scores": -0.5, + "policy/approxkl_avg": 222.47463989257812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5737947821617126, + "step": 666, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001986026763916 + }, + { + "episode": 10688, + "epoch": 0.19211273681561636, + "loss/policy_avg": 0.553016185760498, + "lr": 2.8721242331288343e-06, + "objective/entropy": 235.48336791992188, + "objective/kl": 10.159873962402344, + "objective/non_score_reward": -1.0159873962402344, + "objective/rlhf_reward": -2.3306164304415384, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 54.860572814941406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7995113134384155, + "step": 667, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0003960132598877 + }, + { + "episode": 10704, + "epoch": 0.192400330733005, + "loss/policy_avg": -0.27713173627853394, + "lr": 2.8719325153374235e-06, + "objective/entropy": 32.731258392333984, + "objective/kl": 14.078591346740723, + "objective/non_score_reward": -1.407859206199646, + "objective/rlhf_reward": -5.231436854600906, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.393226623535156, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6517434120178223, + "step": 668, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0037641525268555 + }, + { + "episode": 10720, + "epoch": 0.19268792465039364, + "loss/policy_avg": 0.40979307889938354, + "lr": 2.8717407975460123e-06, + "objective/entropy": -46.12900924682617, + "objective/kl": 10.303600311279297, + "objective/non_score_reward": -1.0303599834442139, + "objective/rlhf_reward": -3.7214399039745327, + "objective/scores": 0.1, + "policy/approxkl_avg": 37.29252624511719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6077979803085327, + "step": 669, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998400092124939 + }, + { + "episode": 10736, + "epoch": 0.19297551856778228, + "loss/policy_avg": 0.25220245122909546, + "lr": 2.8715490797546015e-06, + "objective/entropy": 105.48408508300781, + "objective/kl": 17.024581909179688, + "objective/non_score_reward": -1.702458143234253, + "objective/rlhf_reward": -4.409832602739334, + "objective/scores": 0.6, + "policy/approxkl_avg": 208.58787536621094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7229609489440918, + "step": 670, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999326467514038 + }, + { + "episode": 10752, + "epoch": 0.19326311248517095, + "loss/policy_avg": -0.009634248912334442, + "lr": 2.8713573619631903e-06, + "objective/entropy": -176.94802856445312, + "objective/kl": 12.598766326904297, + "objective/non_score_reward": -1.2598767280578613, + "objective/rlhf_reward": -4.639506882429123, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.82705307006836, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.590308666229248, + "step": 671, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989120960235596 + }, + { + "episode": 10768, + "epoch": 0.19355070640255959, + "loss/policy_avg": 0.18840017914772034, + "lr": 2.871165644171779e-06, + "objective/entropy": 130.62278747558594, + "objective/kl": 11.168953895568848, + "objective/non_score_reward": -1.1168954372406006, + "objective/rlhf_reward": -6.467581748962402, + "objective/scores": -0.5, + "policy/approxkl_avg": 15.816328048706055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6742215752601624, + "step": 672, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0026257038116455 + }, + { + "episode": 10784, + "epoch": 0.19383830031994823, + "loss/policy_avg": 0.23392519354820251, + "lr": 2.8709739263803684e-06, + "objective/entropy": 127.62983703613281, + "objective/kl": 7.803813934326172, + "objective/non_score_reward": -0.780381441116333, + "objective/rlhf_reward": -1.1741145131754236, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 32.454715728759766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6811597347259521, + "step": 673, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976636171340942 + }, + { + "episode": 10800, + "epoch": 0.1941258942373369, + "loss/policy_avg": -0.1319524347782135, + "lr": 2.870782208588957e-06, + "objective/entropy": -20.453079223632812, + "objective/kl": 13.015239715576172, + "objective/non_score_reward": -1.301524043083191, + "objective/rlhf_reward": -7.206096172332764, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.110284805297852, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7017459869384766, + "step": 674, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0026445388793945 + }, + { + "episode": 10816, + "epoch": 0.19441348815472553, + "loss/policy_avg": 0.04714903235435486, + "lr": 2.870590490797546e-06, + "objective/entropy": 303.12890625, + "objective/kl": 7.725805282592773, + "objective/non_score_reward": -0.7725805044174194, + "objective/rlhf_reward": -5.090322017669678, + "objective/scores": -0.5, + "policy/approxkl_avg": 25.820777893066406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7819321155548096, + "step": 675, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988234043121338 + }, + { + "episode": 10832, + "epoch": 0.19470108207211417, + "loss/policy_avg": 0.23574650287628174, + "lr": 2.8703987730061352e-06, + "objective/entropy": -194.20651245117188, + "objective/kl": 14.129228591918945, + "objective/non_score_reward": -1.412922978401184, + "objective/rlhf_reward": -1.2516918838024136, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.932640075683594, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.635951817035675, + "step": 676, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998043775558472 + }, + { + "episode": 10848, + "epoch": 0.1949886759895028, + "loss/policy_avg": 1.922278881072998, + "lr": 2.870207055214724e-06, + "objective/entropy": 202.7659454345703, + "objective/kl": 8.443321228027344, + "objective/non_score_reward": -0.8443321585655212, + "objective/rlhf_reward": -5.377328872680664, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.10383605957031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7711433172225952, + "step": 677, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003726482391357 + }, + { + "episode": 10864, + "epoch": 0.19527626990689148, + "loss/policy_avg": 0.08129671216011047, + "lr": 2.870015337423313e-06, + "objective/entropy": -35.23242950439453, + "objective/kl": 12.633974075317383, + "objective/non_score_reward": -1.263397455215454, + "objective/rlhf_reward": -7.053589820861816, + "objective/scores": -0.5, + "policy/approxkl_avg": 148.25845336914062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8183538913726807, + "step": 678, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973478317260742 + }, + { + "episode": 10880, + "epoch": 0.19556386382428012, + "loss/policy_avg": 0.07529361546039581, + "lr": 2.8698236196319017e-06, + "objective/entropy": -160.49822998046875, + "objective/kl": 14.277109146118164, + "objective/non_score_reward": -1.4277108907699585, + "objective/rlhf_reward": -7.710843086242676, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.69904327392578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6668201684951782, + "step": 679, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9965490102767944 + }, + { + "episode": 10896, + "epoch": 0.19585145774166876, + "loss/policy_avg": -0.24820567667484283, + "lr": 2.869631901840491e-06, + "objective/entropy": -144.34962463378906, + "objective/kl": 7.242027759552002, + "objective/non_score_reward": -0.7242028117179871, + "objective/rlhf_reward": -2.4968111276626583, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.443586349487305, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.5139869451522827, + "step": 680, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0061938762664795 + }, + { + "episode": 10912, + "epoch": 0.1961390516590574, + "loss/policy_avg": -0.08409806340932846, + "lr": 2.8694401840490797e-06, + "objective/entropy": -54.80754852294922, + "objective/kl": 12.144088745117188, + "objective/non_score_reward": -1.2144087553024292, + "objective/rlhf_reward": -1.9339162155401437, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.676344156265259, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8483699560165405, + "step": 681, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0018930435180664 + }, + { + "episode": 10928, + "epoch": 0.19642664557644607, + "loss/policy_avg": 0.12138275802135468, + "lr": 2.8692484662576685e-06, + "objective/entropy": 70.32062530517578, + "objective/kl": 12.482439041137695, + "objective/non_score_reward": -1.248243808746338, + "objective/rlhf_reward": -4.592975145578384, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.664492130279541, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5509634017944336, + "step": 682, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0010645389556885 + }, + { + "episode": 10944, + "epoch": 0.1967142394938347, + "loss/policy_avg": 0.2954164147377014, + "lr": 2.8690567484662578e-06, + "objective/entropy": 46.32169723510742, + "objective/kl": 7.934720039367676, + "objective/non_score_reward": -0.7934720516204834, + "objective/rlhf_reward": 1.2261117786169056, + "objective/scores": 1.1, + "policy/approxkl_avg": 69.41313934326172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5250649452209473, + "step": 683, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001229763031006 + }, + { + "episode": 10960, + "epoch": 0.19700183341122335, + "loss/policy_avg": 0.33847346901893616, + "lr": 2.8688650306748466e-06, + "objective/entropy": 136.00357055664062, + "objective/kl": 11.347841262817383, + "objective/non_score_reward": -1.1347841024398804, + "objective/rlhf_reward": -0.13913632035255397, + "objective/scores": 1.1, + "policy/approxkl_avg": 39.6363525390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5623573064804077, + "step": 684, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99955153465271 + }, + { + "episode": 10976, + "epoch": 0.197289427328612, + "loss/policy_avg": 0.0871865451335907, + "lr": 2.868673312883436e-06, + "objective/entropy": -171.5851593017578, + "objective/kl": 11.16063404083252, + "objective/non_score_reward": -1.1160634756088257, + "objective/rlhf_reward": -4.064253827929496, + "objective/scores": 0.1, + "policy/approxkl_avg": 69.5920181274414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.790228009223938, + "step": 685, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9968866109848022 + }, + { + "episode": 10992, + "epoch": 0.19757702124600066, + "loss/policy_avg": 0.17770184576511383, + "lr": 2.8684815950920246e-06, + "objective/entropy": 117.78619384765625, + "objective/kl": 14.742965698242188, + "objective/non_score_reward": -1.4742965698242188, + "objective/rlhf_reward": -5.497186398506164, + "objective/scores": 0.1, + "policy/approxkl_avg": 171.13133239746094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5882231593132019, + "step": 686, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990973472595215 + }, + { + "episode": 11008, + "epoch": 0.1978646151633893, + "loss/policy_avg": 0.4302918314933777, + "lr": 2.8682898773006134e-06, + "objective/entropy": 86.18273162841797, + "objective/kl": 11.427553176879883, + "objective/non_score_reward": -1.142755389213562, + "objective/rlhf_reward": -6.571021556854248, + "objective/scores": -0.5, + "policy/approxkl_avg": 177.87432861328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.468346506357193, + "step": 687, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976341724395752 + }, + { + "episode": 11024, + "epoch": 0.19815220908077794, + "loss/policy_avg": 0.49217331409454346, + "lr": 2.8680981595092026e-06, + "objective/entropy": -60.496864318847656, + "objective/kl": 19.168210983276367, + "objective/non_score_reward": -1.9168212413787842, + "objective/rlhf_reward": -5.54457879282621, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 153.42633056640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640913486480713, + "step": 688, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000704765319824 + }, + { + "episode": 11040, + "epoch": 0.19843980299816658, + "loss/policy_avg": 0.5149247646331787, + "lr": 2.8679064417177915e-06, + "objective/entropy": 217.9249725341797, + "objective/kl": 10.10614013671875, + "objective/non_score_reward": -1.010614037513733, + "objective/rlhf_reward": 0.3575438499450687, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.427753448486328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5551241636276245, + "step": 689, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004658699035645 + }, + { + "episode": 11056, + "epoch": 0.19872739691555524, + "loss/policy_avg": 2.387340784072876, + "lr": 2.8677147239263803e-06, + "objective/entropy": 90.62098693847656, + "objective/kl": 13.577856063842773, + "objective/non_score_reward": -1.357785701751709, + "objective/rlhf_reward": -5.031142449378967, + "objective/scores": 0.1, + "policy/approxkl_avg": 76.07826232910156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5787378549575806, + "step": 690, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003950595855713 + }, + { + "episode": 11072, + "epoch": 0.19901499083294388, + "loss/policy_avg": 0.07591082900762558, + "lr": 2.8675230061349695e-06, + "objective/entropy": 192.91531372070312, + "objective/kl": 18.27234649658203, + "objective/non_score_reward": -1.8272345066070557, + "objective/rlhf_reward": -5.186231674925361, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 53.839683532714844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5381182432174683, + "step": 691, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999629259109497 + }, + { + "episode": 11088, + "epoch": 0.19930258475033252, + "loss/policy_avg": 0.24866509437561035, + "lr": 2.8673312883435583e-06, + "objective/entropy": 162.3503875732422, + "objective/kl": 18.564533233642578, + "objective/non_score_reward": -1.8564532995224, + "objective/rlhf_reward": -3.025813496112823, + "objective/scores": 1.1, + "policy/approxkl_avg": 120.78541564941406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5609779357910156, + "step": 692, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961520433425903 + }, + { + "episode": 11104, + "epoch": 0.1995901786677212, + "loss/policy_avg": -0.0011049304157495499, + "lr": 2.8671395705521475e-06, + "objective/entropy": 96.88809967041016, + "objective/kl": 7.879084587097168, + "objective/non_score_reward": -0.7879084944725037, + "objective/rlhf_reward": -2.7516339480876923, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.773414611816406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.45891791582107544, + "step": 693, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016512870788574 + }, + { + "episode": 11120, + "epoch": 0.19987777258510983, + "loss/policy_avg": 0.10084787011146545, + "lr": 2.8669478527607364e-06, + "objective/entropy": -41.20310974121094, + "objective/kl": 13.97619915008545, + "objective/non_score_reward": -1.3976198434829712, + "objective/rlhf_reward": -1.190479493141174, + "objective/scores": 1.1, + "policy/approxkl_avg": 52.299827575683594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5506526231765747, + "step": 694, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993311166763306 + }, + { + "episode": 11136, + "epoch": 0.20016536650249847, + "loss/policy_avg": 0.06375744938850403, + "lr": 2.866756134969325e-06, + "objective/entropy": -108.92198181152344, + "objective/kl": 9.946971893310547, + "objective/non_score_reward": -0.9946972131729126, + "objective/rlhf_reward": -5.97878885269165, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.416961193084717, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5260124802589417, + "step": 695, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992403984069824 + }, + { + "episode": 11152, + "epoch": 0.2004529604198871, + "loss/policy_avg": 0.08030396699905396, + "lr": 2.8665644171779144e-06, + "objective/entropy": 123.5355453491211, + "objective/kl": 15.836409568786621, + "objective/non_score_reward": -1.5836410522460938, + "objective/rlhf_reward": -8.334564208984375, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.76978302001953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6828103065490723, + "step": 696, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998550415039062 + }, + { + "episode": 11168, + "epoch": 0.20074055433727578, + "loss/policy_avg": 0.29151037335395813, + "lr": 2.866372699386503e-06, + "objective/entropy": -57.21668243408203, + "objective/kl": 15.339473724365234, + "objective/non_score_reward": -1.5339473485946655, + "objective/rlhf_reward": -1.7357893645763394, + "objective/scores": 1.1, + "policy/approxkl_avg": 91.99110412597656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5846511721611023, + "step": 697, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978073835372925 + }, + { + "episode": 11184, + "epoch": 0.20102814825466442, + "loss/policy_avg": 0.11156813055276871, + "lr": 2.8661809815950924e-06, + "objective/entropy": 4.086456298828125, + "objective/kl": 12.176067352294922, + "objective/non_score_reward": -1.2176066637039185, + "objective/rlhf_reward": -4.470426669716835, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.13668441772461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7282131910324097, + "step": 698, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009164810180664 + }, + { + "episode": 11200, + "epoch": 0.20131574217205306, + "loss/policy_avg": 0.30346500873565674, + "lr": 2.8659892638036812e-06, + "objective/entropy": -137.93194580078125, + "objective/kl": 9.752511978149414, + "objective/non_score_reward": -0.975251317024231, + "objective/rlhf_reward": -3.501005089282989, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.240784645080566, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4937957525253296, + "step": 699, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995919466018677 + }, + { + "episode": 11216, + "epoch": 0.2016033360894417, + "loss/policy_avg": 0.2282610535621643, + "lr": 2.86579754601227e-06, + "objective/entropy": 57.311073303222656, + "objective/kl": 14.862542152404785, + "objective/non_score_reward": -1.486254334449768, + "objective/rlhf_reward": -4.120188410553049, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 18.98688507080078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6022450923919678, + "step": 700, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996930360794067 + }, + { + "episode": 11232, + "epoch": 0.20189093000683037, + "loss/policy_avg": 0.3324587941169739, + "lr": 2.865605828220859e-06, + "objective/entropy": 49.462989807128906, + "objective/kl": 7.709723472595215, + "objective/non_score_reward": -0.7709723711013794, + "objective/rlhf_reward": -5.083889484405518, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.05154800415039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41537797451019287, + "step": 701, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002284049987793 + }, + { + "episode": 11248, + "epoch": 0.202178523924219, + "loss/policy_avg": 0.16259673237800598, + "lr": 2.8654141104294477e-06, + "objective/entropy": 60.31419372558594, + "objective/kl": 13.977958679199219, + "objective/non_score_reward": -1.3977960348129272, + "objective/rlhf_reward": -5.191184258460998, + "objective/scores": 0.1, + "policy/approxkl_avg": 202.6460418701172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7732006311416626, + "step": 702, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978877305984497 + }, + { + "episode": 11264, + "epoch": 0.20246611784160765, + "loss/policy_avg": 0.2512626051902771, + "lr": 2.865222392638037e-06, + "objective/entropy": 156.12353515625, + "objective/kl": 16.06157112121582, + "objective/non_score_reward": -1.6061570644378662, + "objective/rlhf_reward": -3.50090917641041, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 191.80953979492188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6488738059997559, + "step": 703, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996211528778076 + }, + { + "episode": 11280, + "epoch": 0.20275371175899629, + "loss/policy_avg": 0.15953761339187622, + "lr": 2.8650306748466257e-06, + "objective/entropy": -43.2182731628418, + "objective/kl": 16.940393447875977, + "objective/non_score_reward": -1.6940394639968872, + "objective/rlhf_reward": -2.3761578559875485, + "objective/scores": 1.1, + "policy/approxkl_avg": 112.27688598632812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.45469629764556885, + "step": 704, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988117218017578 + }, + { + "episode": 11296, + "epoch": 0.20304130567638495, + "loss/policy_avg": 0.6711443066596985, + "lr": 2.8648389570552145e-06, + "objective/entropy": -33.72349548339844, + "objective/kl": 11.539321899414062, + "objective/non_score_reward": -1.1539320945739746, + "objective/rlhf_reward": -2.8823953429857885, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 61.21073913574219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5703375339508057, + "step": 705, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978996515274048 + }, + { + "episode": 11312, + "epoch": 0.2033288995937736, + "loss/policy_avg": 0.7480028867721558, + "lr": 2.8646472392638038e-06, + "objective/entropy": 220.5673828125, + "objective/kl": 7.203307151794434, + "objective/non_score_reward": -0.7203306555747986, + "objective/rlhf_reward": -1.2772026694455916, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 139.5849609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.786684513092041, + "step": 706, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992549419403076 + }, + { + "episode": 11328, + "epoch": 0.20361649351116223, + "loss/policy_avg": 0.00046522170305252075, + "lr": 2.8644555214723926e-06, + "objective/entropy": 77.21452331542969, + "objective/kl": 15.027108192443848, + "objective/non_score_reward": -1.5027105808258057, + "objective/rlhf_reward": -8.010842323303223, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.50825500488281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7454473972320557, + "step": 707, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993491172790527 + }, + { + "episode": 11344, + "epoch": 0.20390408742855087, + "loss/policy_avg": 0.4308600425720215, + "lr": 2.864263803680982e-06, + "objective/entropy": 105.22822570800781, + "objective/kl": 10.61517333984375, + "objective/non_score_reward": -1.0615174770355225, + "objective/rlhf_reward": -1.3223507746469703, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 15.070667266845703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5496830940246582, + "step": 708, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998878836631775 + }, + { + "episode": 11360, + "epoch": 0.20419168134593954, + "loss/policy_avg": 0.02560766041278839, + "lr": 2.8640720858895706e-06, + "objective/entropy": 52.49213790893555, + "objective/kl": 12.071894645690918, + "objective/non_score_reward": -1.2071894407272339, + "objective/rlhf_reward": -1.9050386294138162, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 22.58022689819336, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.678420901298523, + "step": 709, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016746520996094 + }, + { + "episode": 11376, + "epoch": 0.20447927526332818, + "loss/policy_avg": 0.14677026867866516, + "lr": 2.8638803680981594e-06, + "objective/entropy": -44.507568359375, + "objective/kl": 10.297760009765625, + "objective/non_score_reward": -1.0297759771347046, + "objective/rlhf_reward": -6.119103908538818, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.161418914794922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69853675365448, + "step": 710, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988245964050293 + }, + { + "episode": 11392, + "epoch": 0.20476686918071682, + "loss/policy_avg": 0.3421841859817505, + "lr": 2.8636886503067487e-06, + "objective/entropy": 1.61920166015625, + "objective/kl": 10.758232116699219, + "objective/non_score_reward": -1.0758233070373535, + "objective/rlhf_reward": -6.303293228149414, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.81875610351562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.852238118648529, + "step": 711, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968268871307373 + }, + { + "episode": 11408, + "epoch": 0.2050544630981055, + "loss/policy_avg": 0.48776674270629883, + "lr": 2.8634969325153375e-06, + "objective/entropy": -14.844409942626953, + "objective/kl": 10.682808876037598, + "objective/non_score_reward": -1.0682809352874756, + "objective/rlhf_reward": -3.873123502731323, + "objective/scores": 0.1, + "policy/approxkl_avg": 63.03434371948242, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7068368196487427, + "step": 712, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972708225250244 + }, + { + "episode": 11424, + "epoch": 0.20534205701549413, + "loss/policy_avg": 0.8941645622253418, + "lr": 2.8633052147239263e-06, + "objective/entropy": -93.52375030517578, + "objective/kl": 6.7023420333862305, + "objective/non_score_reward": -0.6702341437339783, + "objective/rlhf_reward": -4.680936813354492, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.55957794189453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7821812629699707, + "step": 713, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999279975891113 + }, + { + "episode": 11440, + "epoch": 0.20562965093288277, + "loss/policy_avg": 0.5883785486221313, + "lr": 2.8631134969325155e-06, + "objective/entropy": -66.35493469238281, + "objective/kl": 16.39910125732422, + "objective/non_score_reward": -1.6399102210998535, + "objective/rlhf_reward": -6.159640645980835, + "objective/scores": 0.1, + "policy/approxkl_avg": 99.94831848144531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5211118459701538, + "step": 714, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999578833580017 + }, + { + "episode": 11456, + "epoch": 0.2059172448502714, + "loss/policy_avg": 0.673638105392456, + "lr": 2.8629217791411043e-06, + "objective/entropy": 56.791744232177734, + "objective/kl": 13.02347183227539, + "objective/non_score_reward": -1.302347183227539, + "objective/rlhf_reward": -7.209388732910156, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.524818420410156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6059812307357788, + "step": 715, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978859424591064 + }, + { + "episode": 11472, + "epoch": 0.20620483876766008, + "loss/policy_avg": 0.033942222595214844, + "lr": 2.8627300613496936e-06, + "objective/entropy": -125.00245666503906, + "objective/kl": 16.061996459960938, + "objective/non_score_reward": -1.6061995029449463, + "objective/rlhf_reward": -2.0247983470559117, + "objective/scores": 1.1, + "policy/approxkl_avg": 95.87913513183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49507004022598267, + "step": 716, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000415563583374 + }, + { + "episode": 11488, + "epoch": 0.20649243268504872, + "loss/policy_avg": 0.32607385516166687, + "lr": 2.8625383435582824e-06, + "objective/entropy": 165.55465698242188, + "objective/kl": 17.082618713378906, + "objective/non_score_reward": -1.7082619667053223, + "objective/rlhf_reward": -5.008219312104295, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 79.94935607910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5599914789199829, + "step": 717, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984850883483887 + }, + { + "episode": 11504, + "epoch": 0.20678002660243736, + "loss/policy_avg": 0.18473944067955017, + "lr": 2.862346625766871e-06, + "objective/entropy": 144.7602996826172, + "objective/kl": 13.379175186157227, + "objective/non_score_reward": -1.3379178047180176, + "objective/rlhf_reward": -4.951671248674392, + "objective/scores": 0.1, + "policy/approxkl_avg": 234.73941040039062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8746376037597656, + "step": 718, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992458820343018 + }, + { + "episode": 11520, + "epoch": 0.207067620519826, + "loss/policy_avg": 0.3600703775882721, + "lr": 2.8621549079754604e-06, + "objective/entropy": 63.41236877441406, + "objective/kl": 14.644229888916016, + "objective/non_score_reward": -1.4644229412078857, + "objective/rlhf_reward": -5.457692122459411, + "objective/scores": 0.1, + "policy/approxkl_avg": 130.00485229492188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.47121816873550415, + "step": 719, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991137981414795 + }, + { + "episode": 11536, + "epoch": 0.20735521443721466, + "loss/policy_avg": 0.23683911561965942, + "lr": 2.8619631901840492e-06, + "objective/entropy": 5.525566101074219, + "objective/kl": 11.11286735534668, + "objective/non_score_reward": -1.1112868785858154, + "objective/rlhf_reward": -2.8888880302577764, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 63.8202018737793, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.42511242628097534, + "step": 720, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9963274002075195 + }, + { + "episode": 11552, + "epoch": 0.2076428083546033, + "loss/policy_avg": 0.6022622585296631, + "lr": 2.8617714723926384e-06, + "objective/entropy": -102.19918823242188, + "objective/kl": 15.452737808227539, + "objective/non_score_reward": -1.545273780822754, + "objective/rlhf_reward": -3.257376019598219, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 49.32636642456055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7975940704345703, + "step": 721, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9964704513549805 + }, + { + "episode": 11568, + "epoch": 0.20793040227199194, + "loss/policy_avg": 0.24385926127433777, + "lr": 2.8615797546012273e-06, + "objective/entropy": 32.55232238769531, + "objective/kl": 14.009422302246094, + "objective/non_score_reward": -1.400942325592041, + "objective/rlhf_reward": -3.2037693619728085, + "objective/scores": 0.6, + "policy/approxkl_avg": 153.17926025390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8219155073165894, + "step": 722, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0011699199676514 + }, + { + "episode": 11584, + "epoch": 0.20821799618938058, + "loss/policy_avg": 0.3180992007255554, + "lr": 2.861388036809816e-06, + "objective/entropy": 72.60052490234375, + "objective/kl": 15.588754653930664, + "objective/non_score_reward": -1.558875560760498, + "objective/rlhf_reward": -8.235502243041992, + "objective/scores": -0.5, + "policy/approxkl_avg": 123.99591064453125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7457439303398132, + "step": 723, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982542991638184 + }, + { + "episode": 11600, + "epoch": 0.20850559010676925, + "loss/policy_avg": 0.08571982383728027, + "lr": 2.861196319018405e-06, + "objective/entropy": 43.14987564086914, + "objective/kl": 17.108150482177734, + "objective/non_score_reward": -1.7108149528503418, + "objective/rlhf_reward": -2.4432600051164624, + "objective/scores": 1.1, + "policy/approxkl_avg": 57.58824920654297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.49476999044418335, + "step": 724, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984831809997559 + }, + { + "episode": 11616, + "epoch": 0.2087931840241579, + "loss/policy_avg": 0.21315881609916687, + "lr": 2.8610046012269937e-06, + "objective/entropy": -128.3315887451172, + "objective/kl": 15.486307144165039, + "objective/non_score_reward": -1.548630714416504, + "objective/rlhf_reward": -1.7945227384567257, + "objective/scores": 1.1, + "policy/approxkl_avg": 213.23350524902344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5425740480422974, + "step": 725, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989008903503418 + }, + { + "episode": 11632, + "epoch": 0.20908077794154653, + "loss/policy_avg": 0.5638971328735352, + "lr": 2.860812883435583e-06, + "objective/entropy": 91.92890930175781, + "objective/kl": 10.46470832824707, + "objective/non_score_reward": -1.0464708805084229, + "objective/rlhf_reward": -3.7858832538127896, + "objective/scores": 0.1, + "policy/approxkl_avg": 65.73753356933594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7266730070114136, + "step": 726, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996936321258545 + }, + { + "episode": 11648, + "epoch": 0.20936837185893517, + "loss/policy_avg": 0.33472561836242676, + "lr": 2.8606211656441717e-06, + "objective/entropy": 194.0995330810547, + "objective/kl": 17.15127182006836, + "objective/non_score_reward": -1.7151273488998413, + "objective/rlhf_reward": -6.460509246587753, + "objective/scores": 0.1, + "policy/approxkl_avg": 155.51809692382812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7743821144104004, + "step": 727, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982187747955322 + }, + { + "episode": 11664, + "epoch": 0.20965596577632384, + "loss/policy_avg": 0.0969974622130394, + "lr": 2.8604294478527605e-06, + "objective/entropy": 252.77114868164062, + "objective/kl": 13.091619491577148, + "objective/non_score_reward": -1.3091620206832886, + "objective/rlhf_reward": -2.3129289641391964, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 98.46098327636719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9146468639373779, + "step": 728, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998947262763977 + }, + { + "episode": 11680, + "epoch": 0.20994355969371248, + "loss/policy_avg": 1.0030015707015991, + "lr": 2.8602377300613498e-06, + "objective/entropy": -29.64310073852539, + "objective/kl": 9.100172996520996, + "objective/non_score_reward": -0.9100174307823181, + "objective/rlhf_reward": -5.640069961547852, + "objective/scores": -0.5, + "policy/approxkl_avg": 66.61669921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6575506329536438, + "step": 729, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992115497589111 + }, + { + "episode": 11696, + "epoch": 0.21023115361110112, + "loss/policy_avg": 0.07021422684192657, + "lr": 2.8600460122699386e-06, + "objective/entropy": -20.677194595336914, + "objective/kl": 11.13044548034668, + "objective/non_score_reward": -1.1130445003509521, + "objective/rlhf_reward": -6.452178001403809, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.34206771850586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4357107877731323, + "step": 730, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978418350219727 + }, + { + "episode": 11712, + "epoch": 0.21051874752848979, + "loss/policy_avg": 0.11832322180271149, + "lr": 2.859854294478528e-06, + "objective/entropy": -303.6877136230469, + "objective/kl": 14.444772720336914, + "objective/non_score_reward": -1.4444773197174072, + "objective/rlhf_reward": -5.377909517288208, + "objective/scores": 0.1, + "policy/approxkl_avg": 57.83880615234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7242922782897949, + "step": 731, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0005035400390625 + }, + { + "episode": 11728, + "epoch": 0.21080634144587843, + "loss/policy_avg": 0.2551548480987549, + "lr": 2.8596625766871166e-06, + "objective/entropy": 227.5770721435547, + "objective/kl": 18.140262603759766, + "objective/non_score_reward": -1.8140263557434082, + "objective/rlhf_reward": -4.332386408687803, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 55.3328857421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6282124519348145, + "step": 732, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000762939453125 + }, + { + "episode": 11744, + "epoch": 0.21109393536326707, + "loss/policy_avg": 0.02594660222530365, + "lr": 2.8594708588957054e-06, + "objective/entropy": 29.276161193847656, + "objective/kl": 18.943225860595703, + "objective/non_score_reward": -1.8943226337432861, + "objective/rlhf_reward": -9.577290534973145, + "objective/scores": -0.5, + "policy/approxkl_avg": 128.90655517578125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6192996501922607, + "step": 733, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997767448425293 + }, + { + "episode": 11760, + "epoch": 0.2113815292806557, + "loss/policy_avg": 0.03574896976351738, + "lr": 2.8592791411042947e-06, + "objective/entropy": -71.63544464111328, + "objective/kl": 11.780060768127441, + "objective/non_score_reward": -1.1780060529708862, + "objective/rlhf_reward": -1.788305324257585, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 23.68338394165039, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6172910332679749, + "step": 734, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973077774047852 + }, + { + "episode": 11776, + "epoch": 0.21166912319804437, + "loss/policy_avg": 0.6160891056060791, + "lr": 2.8590874233128835e-06, + "objective/entropy": -90.55964660644531, + "objective/kl": 15.64774227142334, + "objective/non_score_reward": -1.5647742748260498, + "objective/rlhf_reward": -3.8590969800949093, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.503440856933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6182564496994019, + "step": 735, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989289045333862 + }, + { + "episode": 11792, + "epoch": 0.211956717115433, + "loss/policy_avg": 0.3826139569282532, + "lr": 2.8588957055214727e-06, + "objective/entropy": -80.17495727539062, + "objective/kl": 13.256806373596191, + "objective/non_score_reward": -1.3256807327270508, + "objective/rlhf_reward": -2.3790036782037944, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 100.29702758789062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7060404419898987, + "step": 736, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999138593673706 + }, + { + "episode": 11808, + "epoch": 0.21224431103282165, + "loss/policy_avg": -0.032949626445770264, + "lr": 2.8587039877300615e-06, + "objective/entropy": -176.73757934570312, + "objective/kl": 8.910408973693848, + "objective/non_score_reward": -0.8910409212112427, + "objective/rlhf_reward": -5.564163684844971, + "objective/scores": -0.5, + "policy/approxkl_avg": 92.01910400390625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.49018532037734985, + "step": 737, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0024096965789795 + }, + { + "episode": 11824, + "epoch": 0.2125319049502103, + "loss/policy_avg": 0.06443023681640625, + "lr": 2.8585122699386503e-06, + "objective/entropy": -30.911895751953125, + "objective/kl": 12.735221862792969, + "objective/non_score_reward": -1.2735222578048706, + "objective/rlhf_reward": -0.6940891504287716, + "objective/scores": 1.1, + "policy/approxkl_avg": 67.95742797851562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6443949341773987, + "step": 738, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994258880615234 + }, + { + "episode": 11840, + "epoch": 0.21281949886759896, + "loss/policy_avg": 0.8280965089797974, + "lr": 2.8583205521472396e-06, + "objective/entropy": -47.22200012207031, + "objective/kl": 11.071340560913086, + "objective/non_score_reward": -1.1071341037750244, + "objective/rlhf_reward": -6.428536415100098, + "objective/scores": -0.5, + "policy/approxkl_avg": 87.060302734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5341061353683472, + "step": 739, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975091218948364 + }, + { + "episode": 11856, + "epoch": 0.2131070927849876, + "loss/policy_avg": 0.17557145655155182, + "lr": 2.8581288343558284e-06, + "objective/entropy": -33.48394775390625, + "objective/kl": 12.476408004760742, + "objective/non_score_reward": -1.24764084815979, + "objective/rlhf_reward": -2.0668442777704925, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 54.93560791015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5069207549095154, + "step": 740, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968397617340088 + }, + { + "episode": 11872, + "epoch": 0.21339468670237624, + "loss/policy_avg": 0.07476645708084106, + "lr": 2.857937116564417e-06, + "objective/entropy": -147.0701904296875, + "objective/kl": 16.620323181152344, + "objective/non_score_reward": -1.6620323657989502, + "objective/rlhf_reward": -6.248129403591156, + "objective/scores": 0.1, + "policy/approxkl_avg": 201.01736450195312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.700377345085144, + "step": 741, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000122547149658 + }, + { + "episode": 11888, + "epoch": 0.21368228061976488, + "loss/policy_avg": 0.13703355193138123, + "lr": 2.8577453987730064e-06, + "objective/entropy": -27.77845001220703, + "objective/kl": 11.785709381103516, + "objective/non_score_reward": -1.1785709857940674, + "objective/rlhf_reward": -4.314283764362335, + "objective/scores": 0.1, + "policy/approxkl_avg": 119.8775863647461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6977899074554443, + "step": 742, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989674091339111 + }, + { + "episode": 11904, + "epoch": 0.21396987453715355, + "loss/policy_avg": 0.5084174871444702, + "lr": 2.8575536809815952e-06, + "objective/entropy": -178.72799682617188, + "objective/kl": 6.797292709350586, + "objective/non_score_reward": -0.6797292828559875, + "objective/rlhf_reward": -2.3189171761274334, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.976099014282227, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.889729380607605, + "step": 743, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978809356689453 + }, + { + "episode": 11920, + "epoch": 0.2142574684545422, + "loss/policy_avg": 0.08717440068721771, + "lr": 2.8573619631901845e-06, + "objective/entropy": -100.47814178466797, + "objective/kl": 11.974678993225098, + "objective/non_score_reward": -1.1974678039550781, + "objective/rlhf_reward": -4.38987118601799, + "objective/scores": 0.1, + "policy/approxkl_avg": 116.80694580078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6132055521011353, + "step": 744, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981837272644043 + }, + { + "episode": 11936, + "epoch": 0.21454506237193083, + "loss/policy_avg": 0.1440531611442566, + "lr": 2.857170245398773e-06, + "objective/entropy": -37.25544738769531, + "objective/kl": 12.166690826416016, + "objective/non_score_reward": -1.2166690826416016, + "objective/rlhf_reward": -1.9429572268736093, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 31.051591873168945, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7306356430053711, + "step": 745, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000868558883667 + }, + { + "episode": 11952, + "epoch": 0.21483265628931947, + "loss/policy_avg": 0.22620095312595367, + "lr": 2.856978527607362e-06, + "objective/entropy": -61.764095306396484, + "objective/kl": 12.740127563476562, + "objective/non_score_reward": -1.274012804031372, + "objective/rlhf_reward": -0.6960513353347775, + "objective/scores": 1.1, + "policy/approxkl_avg": 113.7810287475586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7009084224700928, + "step": 746, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981598854064941 + }, + { + "episode": 11968, + "epoch": 0.21512025020670814, + "loss/policy_avg": 0.14071348309516907, + "lr": 2.856786809815951e-06, + "objective/entropy": 29.96725845336914, + "objective/kl": 12.842681884765625, + "objective/non_score_reward": -1.2842683792114258, + "objective/rlhf_reward": -2.213354293943617, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.990078926086426, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4330785572528839, + "step": 747, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996414184570312 + }, + { + "episode": 11984, + "epoch": 0.21540784412409678, + "loss/policy_avg": 0.24370655417442322, + "lr": 2.8565950920245397e-06, + "objective/entropy": -147.6442108154297, + "objective/kl": 16.267515182495117, + "objective/non_score_reward": -1.6267515420913696, + "objective/rlhf_reward": -2.107005929946899, + "objective/scores": 1.1, + "policy/approxkl_avg": 191.24424743652344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5275372862815857, + "step": 748, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9969547986984253 + }, + { + "episode": 12000, + "epoch": 0.21569543804148542, + "loss/policy_avg": 0.16116216778755188, + "lr": 2.856403374233129e-06, + "objective/entropy": -21.843975067138672, + "objective/kl": 13.669893264770508, + "objective/non_score_reward": -1.3669893741607666, + "objective/rlhf_reward": -7.467957496643066, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.672183990478516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9156934022903442, + "step": 749, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0002281665802 + }, + { + "episode": 12016, + "epoch": 0.21598303195887408, + "loss/policy_avg": 0.539941668510437, + "lr": 2.8562116564417177e-06, + "objective/entropy": 195.0397186279297, + "objective/kl": 7.716229438781738, + "objective/non_score_reward": -0.7716230154037476, + "objective/rlhf_reward": -5.086491584777832, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.51429748535156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7705237865447998, + "step": 750, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006752014160156 + }, + { + "episode": 12032, + "epoch": 0.21627062587626272, + "loss/policy_avg": 0.711264967918396, + "lr": 2.8560199386503065e-06, + "objective/entropy": -299.9069519042969, + "objective/kl": 12.776535987854004, + "objective/non_score_reward": -1.277653694152832, + "objective/rlhf_reward": -4.710614657402038, + "objective/scores": 0.1, + "policy/approxkl_avg": 82.51051330566406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6128495335578918, + "step": 751, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9975297451019287 + }, + { + "episode": 12048, + "epoch": 0.21655821979365136, + "loss/policy_avg": 0.4313165545463562, + "lr": 2.8558282208588958e-06, + "objective/entropy": 8.31052017211914, + "objective/kl": 17.218887329101562, + "objective/non_score_reward": -1.7218886613845825, + "objective/rlhf_reward": -3.9638356163513393, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 92.13627624511719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56125807762146, + "step": 752, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996793270111084 + }, + { + "episode": 12064, + "epoch": 0.21684581371104, + "loss/policy_avg": 0.2670312821865082, + "lr": 2.8556365030674846e-06, + "objective/entropy": 1.7406082153320312, + "objective/kl": 10.113969802856445, + "objective/non_score_reward": -1.011396884918213, + "objective/rlhf_reward": -2.0981764336692645, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 66.017822265625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6029417514801025, + "step": 753, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002827644348145 + }, + { + "episode": 12080, + "epoch": 0.21713340762842867, + "loss/policy_avg": 0.22361746430397034, + "lr": 2.855444785276074e-06, + "objective/entropy": -6.838325500488281, + "objective/kl": 9.408108711242676, + "objective/non_score_reward": -0.9408108592033386, + "objective/rlhf_reward": -3.363243496417999, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.14617156982422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7355029582977295, + "step": 754, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000067949295044 + }, + { + "episode": 12096, + "epoch": 0.2174210015458173, + "loss/policy_avg": 0.4131731688976288, + "lr": 2.8552530674846626e-06, + "objective/entropy": -199.2462158203125, + "objective/kl": 11.438251495361328, + "objective/non_score_reward": -1.1438250541687012, + "objective/rlhf_reward": -6.575300216674805, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.276948928833008, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49213117361068726, + "step": 755, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0014610290527344 + }, + { + "episode": 12112, + "epoch": 0.21770859546320595, + "loss/policy_avg": 0.2563888430595398, + "lr": 2.8550613496932514e-06, + "objective/entropy": -50.35034942626953, + "objective/kl": 10.818819046020508, + "objective/non_score_reward": -1.0818817615509033, + "objective/rlhf_reward": -2.7234071231523327, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 5.181286811828613, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6795003414154053, + "step": 756, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993948936462402 + }, + { + "episode": 12128, + "epoch": 0.2179961893805946, + "loss/policy_avg": 0.27015259861946106, + "lr": 2.8548696319018407e-06, + "objective/entropy": 50.269439697265625, + "objective/kl": 11.087736129760742, + "objective/non_score_reward": -1.1087735891342163, + "objective/rlhf_reward": -0.035094296932220104, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.901422500610352, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6422553062438965, + "step": 757, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998644471168518 + }, + { + "episode": 12144, + "epoch": 0.21828378329798326, + "loss/policy_avg": 0.6295909881591797, + "lr": 2.8546779141104295e-06, + "objective/entropy": 40.25965118408203, + "objective/kl": 14.175272941589355, + "objective/non_score_reward": -1.417527198791504, + "objective/rlhf_reward": -2.7463900640022487, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 221.42971801757812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5903568267822266, + "step": 758, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998144268989563 + }, + { + "episode": 12160, + "epoch": 0.2185713772153719, + "loss/policy_avg": 0.6265522241592407, + "lr": 2.8544861963190187e-06, + "objective/entropy": -87.25729370117188, + "objective/kl": 10.555778503417969, + "objective/non_score_reward": -1.0555777549743652, + "objective/rlhf_reward": 0.17768906950950658, + "objective/scores": 1.1, + "policy/approxkl_avg": 90.87506866455078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6326598525047302, + "step": 759, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0008277893066406 + }, + { + "episode": 12176, + "epoch": 0.21885897113276054, + "loss/policy_avg": 0.12339673936367035, + "lr": 2.8542944785276075e-06, + "objective/entropy": -33.543636322021484, + "objective/kl": 7.962390899658203, + "objective/non_score_reward": -0.7962390184402466, + "objective/rlhf_reward": -2.784956073760986, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.08357810974121, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8025330901145935, + "step": 760, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9965462684631348 + }, + { + "episode": 12192, + "epoch": 0.21914656505014918, + "loss/policy_avg": 0.1615457832813263, + "lr": 2.8541027607361963e-06, + "objective/entropy": 145.47601318359375, + "objective/kl": 15.407791137695312, + "objective/non_score_reward": -1.5407792329788208, + "objective/rlhf_reward": -8.163117408752441, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.53536224365234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5605360269546509, + "step": 761, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999860525131226 + }, + { + "episode": 12208, + "epoch": 0.21943415896753785, + "loss/policy_avg": 0.30102694034576416, + "lr": 2.8539110429447856e-06, + "objective/entropy": 27.323680877685547, + "objective/kl": 3.660177707672119, + "objective/non_score_reward": -0.36601775884628296, + "objective/rlhf_reward": 2.935928934812546, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.197434902191162, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.36402425169944763, + "step": 762, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990768432617188 + }, + { + "episode": 12224, + "epoch": 0.21972175288492649, + "loss/policy_avg": 0.18315057456493378, + "lr": 2.8537193251533744e-06, + "objective/entropy": 72.46862030029297, + "objective/kl": 10.31401252746582, + "objective/non_score_reward": -1.0314011573791504, + "objective/rlhf_reward": -1.7256047189235686, + "objective/scores": 0.6, + "policy/approxkl_avg": 9.106523513793945, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7303919792175293, + "step": 763, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000040531158447 + }, + { + "episode": 12240, + "epoch": 0.22000934680231513, + "loss/policy_avg": 0.5514622330665588, + "lr": 2.853527607361963e-06, + "objective/entropy": -74.59797668457031, + "objective/kl": 14.129312515258789, + "objective/non_score_reward": -1.4129313230514526, + "objective/rlhf_reward": -1.2517252624034878, + "objective/scores": 1.1, + "policy/approxkl_avg": 46.52693176269531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4529426395893097, + "step": 764, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981811046600342 + }, + { + "episode": 12256, + "epoch": 0.22029694071970377, + "loss/policy_avg": -0.04296427220106125, + "lr": 2.8533358895705524e-06, + "objective/entropy": -14.170623779296875, + "objective/kl": 16.591388702392578, + "objective/non_score_reward": -1.6591390371322632, + "objective/rlhf_reward": -8.636556625366211, + "objective/scores": -0.5, + "policy/approxkl_avg": 108.84127807617188, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7804018259048462, + "step": 765, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000200271606445 + }, + { + "episode": 12272, + "epoch": 0.22058453463709243, + "loss/policy_avg": 0.14860758185386658, + "lr": 2.8531441717791412e-06, + "objective/entropy": 234.34619140625, + "objective/kl": 14.954992294311523, + "objective/non_score_reward": -1.4954993724822998, + "objective/rlhf_reward": -3.8592914364495616, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 31.390544891357422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7560025453567505, + "step": 766, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960116147994995 + }, + { + "episode": 12288, + "epoch": 0.22087212855448107, + "loss/policy_avg": 0.027001656591892242, + "lr": 2.85295245398773e-06, + "objective/entropy": -231.42864990234375, + "objective/kl": 14.624351501464844, + "objective/non_score_reward": -1.462435245513916, + "objective/rlhf_reward": -1.449740996956825, + "objective/scores": 1.1, + "policy/approxkl_avg": 125.69221496582031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6178066730499268, + "step": 767, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983733892440796 + }, + { + "episode": 12304, + "epoch": 0.2211597224718697, + "loss/policy_avg": 0.10003480315208435, + "lr": 2.852760736196319e-06, + "objective/entropy": 105.91529846191406, + "objective/kl": 11.028611183166504, + "objective/non_score_reward": -1.1028611660003662, + "objective/rlhf_reward": -0.011445081233977916, + "objective/scores": 1.1, + "policy/approxkl_avg": 99.16251373291016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4474494457244873, + "step": 768, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001845359802246 + }, + { + "episode": 12320, + "epoch": 0.22144731638925838, + "loss/policy_avg": 0.3084249794483185, + "lr": 2.852569018404908e-06, + "objective/entropy": 311.5811767578125, + "objective/kl": 12.777912139892578, + "objective/non_score_reward": -1.2777912616729736, + "objective/rlhf_reward": -4.711165154725313, + "objective/scores": 0.1, + "policy/approxkl_avg": 87.19984436035156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.747369110584259, + "step": 769, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000682830810547 + }, + { + "episode": 12336, + "epoch": 0.22173491030664702, + "loss/policy_avg": 0.1643168181180954, + "lr": 2.852377300613497e-06, + "objective/entropy": -229.23110961914062, + "objective/kl": 13.623214721679688, + "objective/non_score_reward": -1.362321376800537, + "objective/rlhf_reward": -7.449285507202148, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.4518051147461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7264796495437622, + "step": 770, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0006937980651855 + }, + { + "episode": 12352, + "epoch": 0.22202250422403566, + "loss/policy_avg": -0.010787129402160645, + "lr": 2.8521855828220857e-06, + "objective/entropy": 124.02545166015625, + "objective/kl": 7.704123497009277, + "objective/non_score_reward": -0.7704123258590698, + "objective/rlhf_reward": -5.081649303436279, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.008346557617188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6329702138900757, + "step": 771, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013375282287598 + }, + { + "episode": 12368, + "epoch": 0.2223100981414243, + "loss/policy_avg": 0.2653355300426483, + "lr": 2.851993865030675e-06, + "objective/entropy": 23.56524658203125, + "objective/kl": 15.354362487792969, + "objective/non_score_reward": -1.5354361534118652, + "objective/rlhf_reward": -5.741744464635849, + "objective/scores": 0.1, + "policy/approxkl_avg": 111.05146789550781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6844107508659363, + "step": 772, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988980293273926 + }, + { + "episode": 12384, + "epoch": 0.22259769205881297, + "loss/policy_avg": 0.08450818061828613, + "lr": 2.8518021472392637e-06, + "objective/entropy": 342.4418029785156, + "objective/kl": 15.424835205078125, + "objective/non_score_reward": -1.5424836874008179, + "objective/rlhf_reward": -5.769934868812561, + "objective/scores": 0.1, + "policy/approxkl_avg": 132.2557373046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8565220236778259, + "step": 773, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9969991445541382 + }, + { + "episode": 12400, + "epoch": 0.2228852859762016, + "loss/policy_avg": -0.5032927393913269, + "lr": 2.851610429447853e-06, + "objective/entropy": -107.81707763671875, + "objective/kl": 13.79594612121582, + "objective/non_score_reward": -1.3795948028564453, + "objective/rlhf_reward": -3.118379211425781, + "objective/scores": 0.6, + "policy/approxkl_avg": 41.49142837524414, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.35487908124923706, + "step": 774, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.006086826324463 + }, + { + "episode": 12416, + "epoch": 0.22317287989359025, + "loss/policy_avg": 0.14595381915569305, + "lr": 2.8514187116564418e-06, + "objective/entropy": -56.38682556152344, + "objective/kl": 13.150504112243652, + "objective/non_score_reward": -1.3150502443313599, + "objective/rlhf_reward": -3.3127899570035293, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 46.86200714111328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7194310426712036, + "step": 775, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001683235168457 + }, + { + "episode": 12432, + "epoch": 0.2234604738109789, + "loss/policy_avg": -0.2638096213340759, + "lr": 2.8512269938650306e-06, + "objective/entropy": 76.923095703125, + "objective/kl": 12.93875503540039, + "objective/non_score_reward": -1.2938756942749023, + "objective/rlhf_reward": -7.175502777099609, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.019851684570312, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5464330911636353, + "step": 776, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.014585018157959 + }, + { + "episode": 12448, + "epoch": 0.22374806772836756, + "loss/policy_avg": 0.19038286805152893, + "lr": 2.85103527607362e-06, + "objective/entropy": 99.66128540039062, + "objective/kl": 11.428020477294922, + "objective/non_score_reward": -1.142802119255066, + "objective/rlhf_reward": -4.1712084174156185, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.72539520263672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.533401608467102, + "step": 777, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9978110790252686 + }, + { + "episode": 12464, + "epoch": 0.2240356616457562, + "loss/policy_avg": 0.31564778089523315, + "lr": 2.8508435582822086e-06, + "objective/entropy": 43.836891174316406, + "objective/kl": 15.314764022827148, + "objective/non_score_reward": -1.5314764976501465, + "objective/rlhf_reward": -1.7259061098098751, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.3494644165039, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7181559205055237, + "step": 778, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9969323873519897 + }, + { + "episode": 12480, + "epoch": 0.22432325556314484, + "loss/policy_avg": 0.15969273447990417, + "lr": 2.8506518404907974e-06, + "objective/entropy": 62.45077896118164, + "objective/kl": 15.108039855957031, + "objective/non_score_reward": -1.5108040571212769, + "objective/rlhf_reward": -5.643216168880462, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.81277465820312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6362130641937256, + "step": 779, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979162216186523 + }, + { + "episode": 12496, + "epoch": 0.22461084948053348, + "loss/policy_avg": -0.1570049226284027, + "lr": 2.8504601226993867e-06, + "objective/entropy": 130.28684997558594, + "objective/kl": 15.398730278015137, + "objective/non_score_reward": -1.5398731231689453, + "objective/rlhf_reward": -3.759492194652557, + "objective/scores": 0.6, + "policy/approxkl_avg": 36.37596893310547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5687062740325928, + "step": 780, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0024261474609375 + }, + { + "episode": 12512, + "epoch": 0.22489844339792214, + "loss/policy_avg": 0.1517828404903412, + "lr": 2.8502684049079755e-06, + "objective/entropy": 66.80693817138672, + "objective/kl": 7.023012161254883, + "objective/non_score_reward": -0.7023012042045593, + "objective/rlhf_reward": -1.1473453245764835, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 70.58222961425781, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5671324729919434, + "step": 781, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999403953552246 + }, + { + "episode": 12528, + "epoch": 0.22518603731531078, + "loss/policy_avg": 0.3743288516998291, + "lr": 2.8500766871165647e-06, + "objective/entropy": 37.623748779296875, + "objective/kl": 9.943754196166992, + "objective/non_score_reward": -0.994375467300415, + "objective/rlhf_reward": -1.5775016754865643, + "objective/scores": 0.6, + "policy/approxkl_avg": 14.220436096191406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5091394186019897, + "step": 782, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982104301452637 + }, + { + "episode": 12544, + "epoch": 0.22547363123269942, + "loss/policy_avg": 0.4069502055644989, + "lr": 2.8498849693251535e-06, + "objective/entropy": 184.85443115234375, + "objective/kl": 14.495233535766602, + "objective/non_score_reward": -1.4495233297348022, + "objective/rlhf_reward": -7.798093795776367, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.534117221832275, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6540646553039551, + "step": 783, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999969720840454 + }, + { + "episode": 12560, + "epoch": 0.22576122515008806, + "loss/policy_avg": -0.15900185704231262, + "lr": 2.8496932515337423e-06, + "objective/entropy": -7.934391021728516, + "objective/kl": 14.886871337890625, + "objective/non_score_reward": -1.4886871576309204, + "objective/rlhf_reward": -7.954748630523682, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.033672332763672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6111799478530884, + "step": 784, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002901554107666 + }, + { + "episode": 12576, + "epoch": 0.22604881906747673, + "loss/policy_avg": 0.05465098097920418, + "lr": 2.8495015337423316e-06, + "objective/entropy": 184.02117919921875, + "objective/kl": 11.928532600402832, + "objective/non_score_reward": -1.192853331565857, + "objective/rlhf_reward": -4.371413117647171, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.86432647705078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8463940620422363, + "step": 785, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000133752822876 + }, + { + "episode": 12592, + "epoch": 0.22633641298486537, + "loss/policy_avg": 0.3418072462081909, + "lr": 2.8493098159509204e-06, + "objective/entropy": -23.809371948242188, + "objective/kl": 15.163887023925781, + "objective/non_score_reward": -1.5163884162902832, + "objective/rlhf_reward": -8.065553665161133, + "objective/scores": -0.5, + "policy/approxkl_avg": 78.87910461425781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8011717796325684, + "step": 786, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.99837064743042 + }, + { + "episode": 12608, + "epoch": 0.226624006902254, + "loss/policy_avg": -0.3748244047164917, + "lr": 2.8491180981595096e-06, + "objective/entropy": 126.95550537109375, + "objective/kl": 14.842533111572266, + "objective/non_score_reward": -1.4842532873153687, + "objective/rlhf_reward": -5.537013149261474, + "objective/scores": 0.1, + "policy/approxkl_avg": 80.93215942382812, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6581273674964905, + "step": 787, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006587505340576 + }, + { + "episode": 12624, + "epoch": 0.22691160081964268, + "loss/policy_avg": 0.230320543050766, + "lr": 2.8489263803680984e-06, + "objective/entropy": 95.23011779785156, + "objective/kl": 14.606027603149414, + "objective/non_score_reward": -1.4606029987335205, + "objective/rlhf_reward": -4.238291654650288, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 115.39529418945312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.728987455368042, + "step": 788, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998013973236084 + }, + { + "episode": 12640, + "epoch": 0.22719919473703132, + "loss/policy_avg": -0.3885463774204254, + "lr": 2.8487346625766872e-06, + "objective/entropy": 97.80613708496094, + "objective/kl": 11.48002815246582, + "objective/non_score_reward": -1.1480028629302979, + "objective/rlhf_reward": -6.592011451721191, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.53124237060547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.594038724899292, + "step": 789, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.010626792907715 + }, + { + "episode": 12656, + "epoch": 0.22748678865441996, + "loss/policy_avg": 0.3055153489112854, + "lr": 2.848542944785276e-06, + "objective/entropy": 199.88526916503906, + "objective/kl": 13.171801567077637, + "objective/non_score_reward": -1.3171800374984741, + "objective/rlhf_reward": -3.6068607918625935, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 85.62678527832031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5295522212982178, + "step": 790, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962729215621948 + }, + { + "episode": 12672, + "epoch": 0.2277743825718086, + "loss/policy_avg": 0.18343961238861084, + "lr": 2.848351226993865e-06, + "objective/entropy": 212.48171997070312, + "objective/kl": 14.552225112915039, + "objective/non_score_reward": -1.4552226066589355, + "objective/rlhf_reward": -7.820890426635742, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.826194763183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7555446028709412, + "step": 791, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981155395507812 + }, + { + "episode": 12688, + "epoch": 0.22806197648919727, + "loss/policy_avg": 1.651806116104126, + "lr": 2.848159509202454e-06, + "objective/entropy": 22.058094024658203, + "objective/kl": 12.990779876708984, + "objective/non_score_reward": -1.2990779876708984, + "objective/rlhf_reward": -4.796312069892883, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.44281005859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7748013734817505, + "step": 792, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999983549118042 + }, + { + "episode": 12704, + "epoch": 0.2283495704065859, + "loss/policy_avg": 0.020992066711187363, + "lr": 2.847967791411043e-06, + "objective/entropy": 100.20834350585938, + "objective/kl": 20.087814331054688, + "objective/non_score_reward": -2.0087814331054688, + "objective/rlhf_reward": -6.373266016662704, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 137.36184692382812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6344403028488159, + "step": 793, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986518621444702 + }, + { + "episode": 12720, + "epoch": 0.22863716432397455, + "loss/policy_avg": 0.06677938997745514, + "lr": 2.8477760736196317e-06, + "objective/entropy": 7.002399444580078, + "objective/kl": 10.064220428466797, + "objective/non_score_reward": -1.0064222812652588, + "objective/rlhf_reward": -3.62568869292736, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.89893341064453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5471144318580627, + "step": 794, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984047412872314 + }, + { + "episode": 12736, + "epoch": 0.22892475824136319, + "loss/policy_avg": -0.07828734815120697, + "lr": 2.847584355828221e-06, + "objective/entropy": -104.39112854003906, + "objective/kl": 18.24449920654297, + "objective/non_score_reward": -1.824450135231018, + "objective/rlhf_reward": -9.297800064086914, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.5714225769043, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6325054168701172, + "step": 795, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970802068710327 + }, + { + "episode": 12752, + "epoch": 0.22921235215875185, + "loss/policy_avg": 0.28975850343704224, + "lr": 2.8473926380368097e-06, + "objective/entropy": 133.83016967773438, + "objective/kl": 8.593679428100586, + "objective/non_score_reward": -0.8593680262565613, + "objective/rlhf_reward": -5.437472343444824, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.80305004119873, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.729377269744873, + "step": 796, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978275299072266 + }, + { + "episode": 12768, + "epoch": 0.2294999460761405, + "loss/policy_avg": 1.422573208808899, + "lr": 2.847200920245399e-06, + "objective/entropy": 149.96119689941406, + "objective/kl": 16.894641876220703, + "objective/non_score_reward": -1.6894640922546387, + "objective/rlhf_reward": -6.357856726646423, + "objective/scores": 0.1, + "policy/approxkl_avg": 230.54080200195312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8895937204360962, + "step": 797, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0016024112701416 + }, + { + "episode": 12784, + "epoch": 0.22978753999352913, + "loss/policy_avg": 0.19597771763801575, + "lr": 2.8470092024539878e-06, + "objective/entropy": 95.1851577758789, + "objective/kl": 12.648846626281738, + "objective/non_score_reward": -1.2648844718933105, + "objective/rlhf_reward": -7.059537887573242, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.24577522277832, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7796587944030762, + "step": 798, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000627279281616 + }, + { + "episode": 12800, + "epoch": 0.23007513391091777, + "loss/policy_avg": 0.13448825478553772, + "lr": 2.8468174846625766e-06, + "objective/entropy": 102.28286743164062, + "objective/kl": 13.572122573852539, + "objective/non_score_reward": -1.3572125434875488, + "objective/rlhf_reward": -1.0288498461246487, + "objective/scores": 1.1, + "policy/approxkl_avg": 100.61666870117188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6195476055145264, + "step": 799, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980723857879639 + }, + { + "episode": 12816, + "epoch": 0.23036272782830644, + "loss/policy_avg": 0.6187319159507751, + "lr": 2.846625766871166e-06, + "objective/entropy": -39.129539489746094, + "objective/kl": 18.39947509765625, + "objective/non_score_reward": -1.8399477005004883, + "objective/rlhf_reward": -2.9597911596298214, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.12504577636719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5955438613891602, + "step": 800, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001791000366211 + }, + { + "episode": 12832, + "epoch": 0.23065032174569508, + "loss/policy_avg": 0.3273153305053711, + "lr": 2.8464340490797546e-06, + "objective/entropy": 13.417747497558594, + "objective/kl": 7.873808860778809, + "objective/non_score_reward": -0.7873809337615967, + "objective/rlhf_reward": -1.324694927009653, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 17.407360076904297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5497540235519409, + "step": 801, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995551109313965 + }, + { + "episode": 12848, + "epoch": 0.23093791566308372, + "loss/policy_avg": 0.3061869144439697, + "lr": 2.8462423312883434e-06, + "objective/entropy": -106.52912902832031, + "objective/kl": 10.210708618164062, + "objective/non_score_reward": -1.0210708379745483, + "objective/rlhf_reward": -6.084282875061035, + "objective/scores": -0.5, + "policy/approxkl_avg": 96.2752456665039, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4860496520996094, + "step": 802, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976447820663452 + }, + { + "episode": 12864, + "epoch": 0.23122550958047236, + "loss/policy_avg": 0.1686146855354309, + "lr": 2.8460506134969327e-06, + "objective/entropy": 29.669536590576172, + "objective/kl": 18.03110122680664, + "objective/non_score_reward": -1.803110122680664, + "objective/rlhf_reward": -4.288721655250761, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 38.228782653808594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8174216151237488, + "step": 803, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982223510742188 + }, + { + "episode": 12880, + "epoch": 0.23151310349786103, + "loss/policy_avg": 0.25656819343566895, + "lr": 2.8458588957055215e-06, + "objective/entropy": -39.153350830078125, + "objective/kl": 9.698554039001465, + "objective/non_score_reward": -0.9698554277420044, + "objective/rlhf_reward": 0.5205781698226932, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.279545783996582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4837508201599121, + "step": 804, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997928142547607 + }, + { + "episode": 12896, + "epoch": 0.23180069741524967, + "loss/policy_avg": 0.08962078392505646, + "lr": 2.8456671779141107e-06, + "objective/entropy": 91.72572326660156, + "objective/kl": 9.199845314025879, + "objective/non_score_reward": -0.9199845790863037, + "objective/rlhf_reward": -1.9466050426165262, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 36.79070281982422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7538923025131226, + "step": 805, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967260360717773 + }, + { + "episode": 12912, + "epoch": 0.2320882913326383, + "loss/policy_avg": 0.2165328860282898, + "lr": 2.8454754601226995e-06, + "objective/entropy": 141.5177001953125, + "objective/kl": 16.128833770751953, + "objective/non_score_reward": -1.61288321018219, + "objective/rlhf_reward": -4.328826548830543, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 8.008130073547363, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.76326584815979, + "step": 806, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970195293426514 + }, + { + "episode": 12928, + "epoch": 0.23237588525002698, + "loss/policy_avg": 0.33014577627182007, + "lr": 2.8452837423312883e-06, + "objective/entropy": -7.563770294189453, + "objective/kl": 15.92538070678711, + "objective/non_score_reward": -1.5925382375717163, + "objective/rlhf_reward": -8.370153427124023, + "objective/scores": -0.5, + "policy/approxkl_avg": 122.01593017578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6476025581359863, + "step": 807, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975717067718506 + }, + { + "episode": 12944, + "epoch": 0.23266347916741562, + "loss/policy_avg": 0.2228621542453766, + "lr": 2.8450920245398776e-06, + "objective/entropy": -146.32801818847656, + "objective/kl": 14.984716415405273, + "objective/non_score_reward": -1.498471736907959, + "objective/rlhf_reward": -7.993886947631836, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.571311950683594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6879932880401611, + "step": 808, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982060194015503 + }, + { + "episode": 12960, + "epoch": 0.23295107308480426, + "loss/policy_avg": -0.1637599915266037, + "lr": 2.8449003067484664e-06, + "objective/entropy": -171.0908203125, + "objective/kl": 4.425614833831787, + "objective/non_score_reward": -0.44256141781806946, + "objective/rlhf_reward": -1.370245734602213, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.86245059967041, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.36712339520454407, + "step": 809, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001098871231079 + }, + { + "episode": 12976, + "epoch": 0.2332386670021929, + "loss/policy_avg": 0.16715390980243683, + "lr": 2.8447085889570556e-06, + "objective/entropy": -126.9478530883789, + "objective/kl": 13.796289443969727, + "objective/non_score_reward": -1.379629135131836, + "objective/rlhf_reward": -5.118516108393669, + "objective/scores": 0.1, + "policy/approxkl_avg": 139.91973876953125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6227424144744873, + "step": 810, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997196197509766 + }, + { + "episode": 12992, + "epoch": 0.23352626091958156, + "loss/policy_avg": 0.8460204601287842, + "lr": 2.8445168711656444e-06, + "objective/entropy": -108.99869537353516, + "objective/kl": 14.032926559448242, + "objective/non_score_reward": -1.4032926559448242, + "objective/rlhf_reward": -3.2131706982851025, + "objective/scores": 0.6, + "policy/approxkl_avg": 148.4869384765625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5060499906539917, + "step": 811, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975128173828125 + }, + { + "episode": 13008, + "epoch": 0.2338138548369702, + "loss/policy_avg": 0.5107466578483582, + "lr": 2.8443251533742332e-06, + "objective/entropy": 26.51531219482422, + "objective/kl": 12.1517333984375, + "objective/non_score_reward": -1.215173363685608, + "objective/rlhf_reward": -2.4606932833790776, + "objective/scores": 0.6, + "policy/approxkl_avg": 44.8768310546875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6984570622444153, + "step": 812, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982036352157593 + }, + { + "episode": 13024, + "epoch": 0.23410144875435884, + "loss/policy_avg": 0.5176931619644165, + "lr": 2.844133435582822e-06, + "objective/entropy": 87.28473663330078, + "objective/kl": 11.600730895996094, + "objective/non_score_reward": -1.160073161125183, + "objective/rlhf_reward": -4.240292406082153, + "objective/scores": 0.1, + "policy/approxkl_avg": 86.09700012207031, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5015227794647217, + "step": 813, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994313716888428 + }, + { + "episode": 13040, + "epoch": 0.23438904267174748, + "loss/policy_avg": 0.2525648772716522, + "lr": 2.843941717791411e-06, + "objective/entropy": -90.09027099609375, + "objective/kl": 17.53409767150879, + "objective/non_score_reward": -1.7534098625183105, + "objective/rlhf_reward": -5.188810522827218, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 133.2302703857422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6975289583206177, + "step": 814, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996279239654541 + }, + { + "episode": 13056, + "epoch": 0.23467663658913615, + "loss/policy_avg": 0.0005577714182436466, + "lr": 2.84375e-06, + "objective/entropy": 87.36531829833984, + "objective/kl": 8.974178314208984, + "objective/non_score_reward": -0.8974178433418274, + "objective/rlhf_reward": -3.189671283960342, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.3939369916915894, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5556995868682861, + "step": 815, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007152557373047 + }, + { + "episode": 13072, + "epoch": 0.2349642305065248, + "loss/policy_avg": -0.10415857285261154, + "lr": 2.843558282208589e-06, + "objective/entropy": 281.04083251953125, + "objective/kl": 10.391765594482422, + "objective/non_score_reward": -1.0391765832901, + "objective/rlhf_reward": -3.7567061990499493, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.441307067871094, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.8066496849060059, + "step": 816, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001943588256836 + }, + { + "episode": 13088, + "epoch": 0.23525182442391343, + "loss/policy_avg": 0.2472541332244873, + "lr": 2.8433665644171777e-06, + "objective/entropy": 124.40581512451172, + "objective/kl": 11.622451782226562, + "objective/non_score_reward": -1.1622451543807983, + "objective/rlhf_reward": -2.248980677127838, + "objective/scores": 0.6, + "policy/approxkl_avg": 6.343780517578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5752452611923218, + "step": 817, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999915599822998 + }, + { + "episode": 13104, + "epoch": 0.23553941834130207, + "loss/policy_avg": 0.2929553687572479, + "lr": 2.843174846625767e-06, + "objective/entropy": -18.095306396484375, + "objective/kl": 10.766685485839844, + "objective/non_score_reward": -1.076668620109558, + "objective/rlhf_reward": -6.306674480438232, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.248212814331055, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44631391763687134, + "step": 818, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986329078674316 + }, + { + "episode": 13120, + "epoch": 0.23582701225869074, + "loss/policy_avg": 0.2811235189437866, + "lr": 2.8429831288343558e-06, + "objective/entropy": -93.74215698242188, + "objective/kl": 10.5712890625, + "objective/non_score_reward": -1.05712890625, + "objective/rlhf_reward": -3.8285156697034832, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.029077529907227, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6386555433273315, + "step": 819, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0000128746032715 + }, + { + "episode": 13136, + "epoch": 0.23611460617607938, + "loss/policy_avg": 0.321929007768631, + "lr": 2.842791411042945e-06, + "objective/entropy": -222.2208251953125, + "objective/kl": 10.463525772094727, + "objective/non_score_reward": -1.0463526248931885, + "objective/rlhf_reward": -3.7854103505611416, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.59898376464844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7040956616401672, + "step": 820, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001721382141113 + }, + { + "episode": 13152, + "epoch": 0.23640220009346802, + "loss/policy_avg": 0.2252596765756607, + "lr": 2.842599693251534e-06, + "objective/entropy": -5.484672546386719, + "objective/kl": 7.207294940948486, + "objective/non_score_reward": -0.7207294702529907, + "objective/rlhf_reward": 0.04080113327386714, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 4.542896270751953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5736124515533447, + "step": 821, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999035120010376 + }, + { + "episode": 13168, + "epoch": 0.23668979401085666, + "loss/policy_avg": 0.17349383234977722, + "lr": 2.8424079754601226e-06, + "objective/entropy": 260.90020751953125, + "objective/kl": 13.826977729797363, + "objective/non_score_reward": -1.3826978206634521, + "objective/rlhf_reward": -7.530791282653809, + "objective/scores": -0.5, + "policy/approxkl_avg": 39.94245529174805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8797262907028198, + "step": 822, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994349479675293 + }, + { + "episode": 13184, + "epoch": 0.23697738792824533, + "loss/policy_avg": -0.0034087272360920906, + "lr": 2.842216257668712e-06, + "objective/entropy": -51.63107681274414, + "objective/kl": 10.85478401184082, + "objective/non_score_reward": -1.0854783058166504, + "objective/rlhf_reward": 0.058087015151977894, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.927780151367188, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6021898984909058, + "step": 823, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001553535461426 + }, + { + "episode": 13200, + "epoch": 0.23726498184563397, + "loss/policy_avg": 0.42283856868743896, + "lr": 2.8420245398773006e-06, + "objective/entropy": 200.68020629882812, + "objective/kl": 10.381429672241211, + "objective/non_score_reward": -1.0381429195404053, + "objective/rlhf_reward": -6.152571678161621, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.65203857421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7309496998786926, + "step": 824, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966254234313965 + }, + { + "episode": 13216, + "epoch": 0.2375525757630226, + "loss/policy_avg": -0.09755183756351471, + "lr": 2.84183282208589e-06, + "objective/entropy": -69.01538848876953, + "objective/kl": 5.6848883628845215, + "objective/non_score_reward": -0.5684888362884521, + "objective/rlhf_reward": 2.1260446399450306, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.0148842334747314, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44518405199050903, + "step": 825, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0009875297546387 + }, + { + "episode": 13232, + "epoch": 0.23784016968041127, + "loss/policy_avg": -0.25965848565101624, + "lr": 2.8416411042944787e-06, + "objective/entropy": 171.923828125, + "objective/kl": 8.58846664428711, + "objective/non_score_reward": -0.8588467836380005, + "objective/rlhf_reward": -3.035387037694454, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.28413391113281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5065795183181763, + "step": 826, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.007476806640625 + }, + { + "episode": 13248, + "epoch": 0.2381277635977999, + "loss/policy_avg": 0.05290607735514641, + "lr": 2.8414493865030675e-06, + "objective/entropy": 31.982357025146484, + "objective/kl": 14.847391128540039, + "objective/non_score_reward": -1.484739065170288, + "objective/rlhf_reward": -5.538956558704376, + "objective/scores": 0.1, + "policy/approxkl_avg": 134.27171325683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3722934126853943, + "step": 827, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999661922454834 + }, + { + "episode": 13264, + "epoch": 0.23841535751518855, + "loss/policy_avg": 0.2654344141483307, + "lr": 2.8412576687116567e-06, + "objective/entropy": 174.80413818359375, + "objective/kl": 10.663211822509766, + "objective/non_score_reward": -1.0663211345672607, + "objective/rlhf_reward": -3.8652845233678814, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.65331268310547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6540871262550354, + "step": 828, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004587173461914 + }, + { + "episode": 13280, + "epoch": 0.2387029514325772, + "loss/policy_avg": 0.29667001962661743, + "lr": 2.8410659509202455e-06, + "objective/entropy": -67.4645004272461, + "objective/kl": 11.012588500976562, + "objective/non_score_reward": -1.1012588739395142, + "objective/rlhf_reward": -4.005035495758056, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.841941833496094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6608229279518127, + "step": 829, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974405765533447 + }, + { + "episode": 13296, + "epoch": 0.23899054534996586, + "loss/policy_avg": 0.0870223194360733, + "lr": 2.8408742331288343e-06, + "objective/entropy": 218.6323699951172, + "objective/kl": 13.182598114013672, + "objective/non_score_reward": -1.3182597160339355, + "objective/rlhf_reward": -3.6111795954114063, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 191.98348999023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8216352462768555, + "step": 830, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983137845993042 + }, + { + "episode": 13312, + "epoch": 0.2392781392673545, + "loss/policy_avg": 0.10631455481052399, + "lr": 2.8406825153374236e-06, + "objective/entropy": -176.3839569091797, + "objective/kl": 10.902888298034668, + "objective/non_score_reward": -1.0902888774871826, + "objective/rlhf_reward": -6.3611555099487305, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.94891929626465, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6645264029502869, + "step": 831, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989513158798218 + }, + { + "episode": 13328, + "epoch": 0.23956573318474314, + "loss/policy_avg": 0.5899176001548767, + "lr": 2.8404907975460124e-06, + "objective/entropy": 200.20742797851562, + "objective/kl": 18.23828125, + "objective/non_score_reward": -1.8238282203674316, + "objective/rlhf_reward": -9.295312881469727, + "objective/scores": -0.5, + "policy/approxkl_avg": 94.45753479003906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5188368558883667, + "step": 832, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002236366271973 + }, + { + "episode": 13344, + "epoch": 0.23985332710213178, + "loss/policy_avg": 0.34776580333709717, + "lr": 2.8402990797546016e-06, + "objective/entropy": 3.6189041137695312, + "objective/kl": 14.347872734069824, + "objective/non_score_reward": -1.4347872734069824, + "objective/rlhf_reward": -5.339149034023285, + "objective/scores": 0.1, + "policy/approxkl_avg": 63.92158889770508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49274176359176636, + "step": 833, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999204158782959 + }, + { + "episode": 13360, + "epoch": 0.24014092101952045, + "loss/policy_avg": -0.2418670505285263, + "lr": 2.84010736196319e-06, + "objective/entropy": 307.7858581542969, + "objective/kl": 11.471115112304688, + "objective/non_score_reward": -1.1471115350723267, + "objective/rlhf_reward": -4.188446259498596, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.762351989746094, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7961260676383972, + "step": 834, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0257229804992676 + }, + { + "episode": 13376, + "epoch": 0.2404285149369091, + "loss/policy_avg": 0.4314028024673462, + "lr": 2.8399156441717792e-06, + "objective/entropy": 106.43098449707031, + "objective/kl": 12.59414291381836, + "objective/non_score_reward": -1.2594143152236938, + "objective/rlhf_reward": -0.6376570820808407, + "objective/scores": 1.1, + "policy/approxkl_avg": 49.60033416748047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6837184429168701, + "step": 835, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000203847885132 + }, + { + "episode": 13392, + "epoch": 0.24071610885429773, + "loss/policy_avg": -0.4717941880226135, + "lr": 2.839723926380368e-06, + "objective/entropy": 156.29014587402344, + "objective/kl": 13.116241455078125, + "objective/non_score_reward": -1.3116241693496704, + "objective/rlhf_reward": -3.690237252917841, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 150.56942749023438, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6304831504821777, + "step": 836, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002359390258789 + }, + { + "episode": 13408, + "epoch": 0.24100370277168637, + "loss/policy_avg": 0.48564639687538147, + "lr": 2.839532208588957e-06, + "objective/entropy": 25.292335510253906, + "objective/kl": 10.876035690307617, + "objective/non_score_reward": -1.0876035690307617, + "objective/rlhf_reward": -2.403002808766301, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 81.79934692382812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7601375579833984, + "step": 837, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0018410682678223 + }, + { + "episode": 13424, + "epoch": 0.24129129668907504, + "loss/policy_avg": 0.2227717936038971, + "lr": 2.839340490797546e-06, + "objective/entropy": 290.1324157714844, + "objective/kl": 8.968036651611328, + "objective/non_score_reward": -0.896803617477417, + "objective/rlhf_reward": -5.587214469909668, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.772152900695801, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6874889135360718, + "step": 838, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978574514389038 + }, + { + "episode": 13440, + "epoch": 0.24157889060646368, + "loss/policy_avg": 0.378933846950531, + "lr": 2.839148773006135e-06, + "objective/entropy": 83.87255096435547, + "objective/kl": 16.091838836669922, + "objective/non_score_reward": -1.6091837882995605, + "objective/rlhf_reward": -8.436735153198242, + "objective/scores": -0.5, + "policy/approxkl_avg": 171.8421630859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7673331499099731, + "step": 839, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988526105880737 + }, + { + "episode": 13456, + "epoch": 0.24186648452385232, + "loss/policy_avg": 0.05077691376209259, + "lr": 2.838957055214724e-06, + "objective/entropy": 201.8306884765625, + "objective/kl": 10.442170143127441, + "objective/non_score_reward": -1.0442171096801758, + "objective/rlhf_reward": -6.176868438720703, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.752803802490234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7712172269821167, + "step": 840, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.1516284942626953 + }, + { + "episode": 13472, + "epoch": 0.24215407844124096, + "loss/policy_avg": 0.6697203516960144, + "lr": 2.838765337423313e-06, + "objective/entropy": -228.6722869873047, + "objective/kl": 12.05379867553711, + "objective/non_score_reward": -1.2053799629211426, + "objective/rlhf_reward": -4.421519672870636, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.305578231811523, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6277028918266296, + "step": 841, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998668909072876 + }, + { + "episode": 13488, + "epoch": 0.24244167235862962, + "loss/policy_avg": -0.059078969061374664, + "lr": 2.8385736196319018e-06, + "objective/entropy": 41.68458557128906, + "objective/kl": 9.514412879943848, + "objective/non_score_reward": -0.9514412879943848, + "objective/rlhf_reward": -3.4057652115821835, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.8853363990783691, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6527504324913025, + "step": 842, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000281810760498 + }, + { + "episode": 13504, + "epoch": 0.24272926627601826, + "loss/policy_avg": 0.03922227397561073, + "lr": 2.838381901840491e-06, + "objective/entropy": -126.3974838256836, + "objective/kl": 15.604471206665039, + "objective/non_score_reward": -1.560447096824646, + "objective/rlhf_reward": -5.841788208484649, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.71725082397461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5101525187492371, + "step": 843, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998733401298523 + }, + { + "episode": 13520, + "epoch": 0.2430168601934069, + "loss/policy_avg": -0.07494255900382996, + "lr": 2.83819018404908e-06, + "objective/entropy": -109.69844055175781, + "objective/kl": 12.058096885681152, + "objective/non_score_reward": -1.2058095932006836, + "objective/rlhf_reward": -4.423238492012024, + "objective/scores": 0.1, + "policy/approxkl_avg": 71.72029876708984, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6301126480102539, + "step": 844, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0005083084106445 + }, + { + "episode": 13536, + "epoch": 0.24330445411079557, + "loss/policy_avg": -0.0021638330072164536, + "lr": 2.8379984662576686e-06, + "objective/entropy": -55.14314651489258, + "objective/kl": 16.980009078979492, + "objective/non_score_reward": -1.6980011463165283, + "objective/rlhf_reward": -6.392004287242889, + "objective/scores": 0.1, + "policy/approxkl_avg": 58.57079315185547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5378575921058655, + "step": 845, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998762607574463 + }, + { + "episode": 13552, + "epoch": 0.2435920480281842, + "loss/policy_avg": 0.18776272237300873, + "lr": 2.837806748466258e-06, + "objective/entropy": 213.31375122070312, + "objective/kl": 13.382487297058105, + "objective/non_score_reward": -1.3382488489151, + "objective/rlhf_reward": -0.9529952764511105, + "objective/scores": 1.1, + "policy/approxkl_avg": 47.32990646362305, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5546428561210632, + "step": 846, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999443769454956 + }, + { + "episode": 13568, + "epoch": 0.24387964194557285, + "loss/policy_avg": -0.10050228238105774, + "lr": 2.8376150306748467e-06, + "objective/entropy": -1.5470504760742188, + "objective/kl": 5.421267509460449, + "objective/non_score_reward": -0.5421267151832581, + "objective/rlhf_reward": -1.768506808578968, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.883844375610352, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.690306544303894, + "step": 847, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0045018196105957 + }, + { + "episode": 13584, + "epoch": 0.2441672358629615, + "loss/policy_avg": 0.14597558975219727, + "lr": 2.837423312883436e-06, + "objective/entropy": -17.603618621826172, + "objective/kl": 11.631009101867676, + "objective/non_score_reward": -1.1631009578704834, + "objective/rlhf_reward": -4.252403473854065, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.45585250854492, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5793694257736206, + "step": 848, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997261643409729 + }, + { + "episode": 13600, + "epoch": 0.24445482978035016, + "loss/policy_avg": 0.3591553866863251, + "lr": 2.8372315950920247e-06, + "objective/entropy": 140.2003173828125, + "objective/kl": 16.13811492919922, + "objective/non_score_reward": -1.613811731338501, + "objective/rlhf_reward": -6.055246709287166, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.97541809082031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7342993021011353, + "step": 849, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992564916610718 + }, + { + "episode": 13616, + "epoch": 0.2447424236977388, + "loss/policy_avg": 0.30371835827827454, + "lr": 2.8370398773006135e-06, + "objective/entropy": 231.95297241210938, + "objective/kl": 17.194393157958984, + "objective/non_score_reward": -1.7194395065307617, + "objective/rlhf_reward": -4.755051466003929, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 10.407341003417969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5657638311386108, + "step": 850, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007481575012207 + }, + { + "episode": 13632, + "epoch": 0.24503001761512744, + "loss/policy_avg": 0.5038444399833679, + "lr": 2.8368481595092027e-06, + "objective/entropy": 60.03744125366211, + "objective/kl": 14.280261993408203, + "objective/non_score_reward": -1.4280261993408203, + "objective/rlhf_reward": -3.764693687634404, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 37.566680908203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611575186252594, + "step": 851, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986273050308228 + }, + { + "episode": 13648, + "epoch": 0.24531761153251608, + "loss/policy_avg": 0.25327011942863464, + "lr": 2.8366564417177915e-06, + "objective/entropy": 80.94923400878906, + "objective/kl": 13.566845893859863, + "objective/non_score_reward": -1.356684684753418, + "objective/rlhf_reward": -1.0267389029264447, + "objective/scores": 1.1, + "policy/approxkl_avg": 30.468521118164062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.743263840675354, + "step": 852, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002877712249756 + }, + { + "episode": 13664, + "epoch": 0.24560520544990475, + "loss/policy_avg": 0.7088375091552734, + "lr": 2.8364647239263804e-06, + "objective/entropy": -19.6234130859375, + "objective/kl": 18.899595260620117, + "objective/non_score_reward": -1.889959454536438, + "objective/rlhf_reward": -5.735009069713662, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 87.53837585449219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6533883213996887, + "step": 853, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973728656768799 + }, + { + "episode": 13680, + "epoch": 0.24589279936729339, + "loss/policy_avg": 0.21842418611049652, + "lr": 2.8362730061349696e-06, + "objective/entropy": -2.3630218505859375, + "objective/kl": 10.437822341918945, + "objective/non_score_reward": -1.043782353401184, + "objective/rlhf_reward": -1.775129473209381, + "objective/scores": 0.6, + "policy/approxkl_avg": 25.123273849487305, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4626666307449341, + "step": 854, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997999906539917 + }, + { + "episode": 13696, + "epoch": 0.24618039328468203, + "loss/policy_avg": 0.30842268466949463, + "lr": 2.8360812883435584e-06, + "objective/entropy": 73.10386657714844, + "objective/kl": 14.02588176727295, + "objective/non_score_reward": -1.402587890625, + "objective/rlhf_reward": -7.6103515625, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.31865692138672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47557830810546875, + "step": 855, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000145435333252 + }, + { + "episode": 13712, + "epoch": 0.24646798720207067, + "loss/policy_avg": 0.1986391693353653, + "lr": 2.835889570552147e-06, + "objective/entropy": 41.59130096435547, + "objective/kl": 11.414254188537598, + "objective/non_score_reward": -1.141425371170044, + "objective/rlhf_reward": -6.565701484680176, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.20241165161133, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5653365850448608, + "step": 856, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001561164855957 + }, + { + "episode": 13728, + "epoch": 0.24675558111945933, + "loss/policy_avg": -0.09086053818464279, + "lr": 2.835697852760736e-06, + "objective/entropy": 129.108154296875, + "objective/kl": 11.600625038146973, + "objective/non_score_reward": -1.160062551498413, + "objective/rlhf_reward": -4.240250265598297, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.625953674316406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8041683435440063, + "step": 857, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001258373260498 + }, + { + "episode": 13744, + "epoch": 0.24704317503684797, + "loss/policy_avg": 0.4292464852333069, + "lr": 2.8355061349693253e-06, + "objective/entropy": -7.69146728515625, + "objective/kl": 8.978507995605469, + "objective/non_score_reward": -0.8978508710861206, + "objective/rlhf_reward": -0.6676843806516852, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 29.219491958618164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43980222940444946, + "step": 858, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975333213806152 + }, + { + "episode": 13760, + "epoch": 0.2473307689542366, + "loss/policy_avg": -0.1740269660949707, + "lr": 2.835314417177914e-06, + "objective/entropy": 199.1434783935547, + "objective/kl": 10.360536575317383, + "objective/non_score_reward": -1.0360536575317383, + "objective/rlhf_reward": -3.74421471953392, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.653465270996094, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.46243250370025635, + "step": 859, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001521110534668 + }, + { + "episode": 13776, + "epoch": 0.24761836287162525, + "loss/policy_avg": 0.11918877065181732, + "lr": 2.835122699386503e-06, + "objective/entropy": 72.46167755126953, + "objective/kl": 14.164287567138672, + "objective/non_score_reward": -1.416428565979004, + "objective/rlhf_reward": -7.665714263916016, + "objective/scores": -0.5, + "policy/approxkl_avg": 134.235107421875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4980497360229492, + "step": 860, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996706247329712 + }, + { + "episode": 13792, + "epoch": 0.24790595678901392, + "loss/policy_avg": 0.05426352471113205, + "lr": 2.834930981595092e-06, + "objective/entropy": 4.267814636230469, + "objective/kl": 14.633834838867188, + "objective/non_score_reward": -1.463383436203003, + "objective/rlhf_reward": -1.453533565998077, + "objective/scores": 1.1, + "policy/approxkl_avg": 25.47060203552246, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.566750168800354, + "step": 861, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998802900314331 + }, + { + "episode": 13808, + "epoch": 0.24819355070640256, + "loss/policy_avg": 0.5575066804885864, + "lr": 2.834739263803681e-06, + "objective/entropy": 206.50341796875, + "objective/kl": 13.636398315429688, + "objective/non_score_reward": -1.3636398315429688, + "objective/rlhf_reward": -1.0545593261718746, + "objective/scores": 1.1, + "policy/approxkl_avg": 26.75678062438965, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4932931065559387, + "step": 862, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983261823654175 + }, + { + "episode": 13824, + "epoch": 0.2484811446237912, + "loss/policy_avg": 0.23257222771644592, + "lr": 2.83454754601227e-06, + "objective/entropy": -39.31481170654297, + "objective/kl": 14.039112091064453, + "objective/non_score_reward": -1.4039111137390137, + "objective/rlhf_reward": -7.615644454956055, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.90751647949219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.643797755241394, + "step": 863, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972407817840576 + }, + { + "episode": 13840, + "epoch": 0.24876873854117984, + "loss/policy_avg": 0.02133660763502121, + "lr": 2.834355828220859e-06, + "objective/entropy": -178.11383056640625, + "objective/kl": 11.982830047607422, + "objective/non_score_reward": -1.1982829570770264, + "objective/rlhf_reward": -4.393131679296493, + "objective/scores": 0.1, + "policy/approxkl_avg": 53.20249938964844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7372720241546631, + "step": 864, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001306533813477 + }, + { + "episode": 13856, + "epoch": 0.2490563324585685, + "loss/policy_avg": 0.11362017691135406, + "lr": 2.8341641104294478e-06, + "objective/entropy": 79.2239990234375, + "objective/kl": 13.957067489624023, + "objective/non_score_reward": -1.3957067728042603, + "objective/rlhf_reward": -5.182827150821685, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.16444396972656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6697244048118591, + "step": 865, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998111367225647 + }, + { + "episode": 13872, + "epoch": 0.24934392637595715, + "loss/policy_avg": -0.15431474149227142, + "lr": 2.833972392638037e-06, + "objective/entropy": 85.38504028320312, + "objective/kl": 12.748974800109863, + "objective/non_score_reward": -1.274897575378418, + "objective/rlhf_reward": -4.699590167403221, + "objective/scores": 0.1, + "policy/approxkl_avg": 95.32827758789062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4564756155014038, + "step": 866, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008811950683594 + }, + { + "episode": 13888, + "epoch": 0.2496315202933458, + "loss/policy_avg": 0.5219398140907288, + "lr": 2.833780674846626e-06, + "objective/entropy": -68.0516128540039, + "objective/kl": 10.130060195922852, + "objective/non_score_reward": -1.0130060911178589, + "objective/rlhf_reward": -3.6520243942737576, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.153987884521484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.576066255569458, + "step": 867, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983216524124146 + }, + { + "episode": 13904, + "epoch": 0.24991911421073446, + "loss/policy_avg": -0.27920252084732056, + "lr": 2.8335889570552146e-06, + "objective/entropy": 37.78301239013672, + "objective/kl": 8.389419555664062, + "objective/non_score_reward": -0.8389419317245483, + "objective/rlhf_reward": -5.355767726898193, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.89885711669922, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.3526889383792877, + "step": 868, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0450096130371094 + }, + { + "episode": 13920, + "epoch": 0.25020670812812307, + "loss/policy_avg": 0.23919862508773804, + "lr": 2.833397239263804e-06, + "objective/entropy": 96.82152557373047, + "objective/kl": 14.15482234954834, + "objective/non_score_reward": -1.415482521057129, + "objective/rlhf_reward": -5.261929965019226, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.41618347167969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7077710628509521, + "step": 869, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002362728118896 + }, + { + "episode": 13936, + "epoch": 0.25049430204551176, + "loss/policy_avg": -0.008724374696612358, + "lr": 2.8332055214723927e-06, + "objective/entropy": 5.678382873535156, + "objective/kl": 8.6428861618042, + "objective/non_score_reward": -0.8642886877059937, + "objective/rlhf_reward": -0.5334355577242103, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 18.883581161499023, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6166142225265503, + "step": 870, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976524114608765 + }, + { + "episode": 13952, + "epoch": 0.2507818959629004, + "loss/policy_avg": 0.5849788188934326, + "lr": 2.833013803680982e-06, + "objective/entropy": 7.5515899658203125, + "objective/kl": 12.30251693725586, + "objective/non_score_reward": -1.2302517890930176, + "objective/rlhf_reward": -3.259147563547479, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 45.453224182128906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8170486688613892, + "step": 871, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998389482498169 + }, + { + "episode": 13968, + "epoch": 0.25106948988028904, + "loss/policy_avg": 1.0887643098831177, + "lr": 2.8328220858895707e-06, + "objective/entropy": 19.560287475585938, + "objective/kl": 9.816727638244629, + "objective/non_score_reward": -0.9816729426383972, + "objective/rlhf_reward": -3.526691591739654, + "objective/scores": 0.1, + "policy/approxkl_avg": 59.10675048828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4642179012298584, + "step": 872, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975433349609375 + }, + { + "episode": 13984, + "epoch": 0.2513570837976777, + "loss/policy_avg": 0.27298757433891296, + "lr": 2.8326303680981595e-06, + "objective/entropy": -61.813533782958984, + "objective/kl": 14.266277313232422, + "objective/non_score_reward": -1.4266278743743896, + "objective/rlhf_reward": -3.3065116763114926, + "objective/scores": 0.6, + "policy/approxkl_avg": 34.11764907836914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49072712659835815, + "step": 873, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989274740219116 + }, + { + "episode": 14000, + "epoch": 0.2516446777150663, + "loss/policy_avg": 0.22000867128372192, + "lr": 2.8324386503067487e-06, + "objective/entropy": -82.48310089111328, + "objective/kl": 12.360732078552246, + "objective/non_score_reward": -1.2360732555389404, + "objective/rlhf_reward": -6.944293022155762, + "objective/scores": -0.5, + "policy/approxkl_avg": 76.64207458496094, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4945850372314453, + "step": 874, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999876856803894 + }, + { + "episode": 14016, + "epoch": 0.25193227163245496, + "loss/policy_avg": 0.2379000037908554, + "lr": 2.8322469325153376e-06, + "objective/entropy": 11.343524932861328, + "objective/kl": 13.053705215454102, + "objective/non_score_reward": -1.305370569229126, + "objective/rlhf_reward": -7.221482276916504, + "objective/scores": -0.5, + "policy/approxkl_avg": 109.89395141601562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5588850975036621, + "step": 875, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999418258666992 + }, + { + "episode": 14032, + "epoch": 0.2522198655498436, + "loss/policy_avg": 0.48280632495880127, + "lr": 2.8320552147239268e-06, + "objective/entropy": 169.83905029296875, + "objective/kl": 12.475770950317383, + "objective/non_score_reward": -1.2475769519805908, + "objective/rlhf_reward": -3.042896787600453, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 133.74017333984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.40729820728302, + "step": 876, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9965707063674927 + }, + { + "episode": 14048, + "epoch": 0.25250745946723224, + "loss/policy_avg": 0.5907744765281677, + "lr": 2.8318634969325156e-06, + "objective/entropy": 206.38848876953125, + "objective/kl": 10.661521911621094, + "objective/non_score_reward": -1.0661522150039673, + "objective/rlhf_reward": -3.8646090537309643, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.813899993896484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71628737449646, + "step": 877, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990339279174805 + }, + { + "episode": 14064, + "epoch": 0.25279505338462094, + "loss/policy_avg": 0.2785950005054474, + "lr": 2.8316717791411044e-06, + "objective/entropy": 162.6572723388672, + "objective/kl": 12.341489791870117, + "objective/non_score_reward": -1.2341489791870117, + "objective/rlhf_reward": -2.012877260090086, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 85.18682861328125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8230720162391663, + "step": 878, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007762908935547 + }, + { + "episode": 14080, + "epoch": 0.2530826473020096, + "loss/policy_avg": 0.5439757108688354, + "lr": 2.8314800613496932e-06, + "objective/entropy": -29.994464874267578, + "objective/kl": 9.197661399841309, + "objective/non_score_reward": -0.9197661876678467, + "objective/rlhf_reward": -3.279064661264419, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.833585739135742, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.591437816619873, + "step": 879, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997772216796875 + }, + { + "episode": 14096, + "epoch": 0.2533702412193982, + "loss/policy_avg": 0.41080114245414734, + "lr": 2.831288343558282e-06, + "objective/entropy": 239.91580200195312, + "objective/kl": 9.134315490722656, + "objective/non_score_reward": -0.9134315252304077, + "objective/rlhf_reward": -5.653726100921631, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.719074249267578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5987927913665771, + "step": 880, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004446506500244 + }, + { + "episode": 14112, + "epoch": 0.25365783513678686, + "loss/policy_avg": 0.6375956535339355, + "lr": 2.8310966257668713e-06, + "objective/entropy": -158.66209411621094, + "objective/kl": 10.0148344039917, + "objective/non_score_reward": -1.0014833211898804, + "objective/rlhf_reward": -1.082214523793432, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 21.38302230834961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6316667795181274, + "step": 881, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006909370422363 + }, + { + "episode": 14128, + "epoch": 0.2539454290541755, + "loss/policy_avg": 0.45256638526916504, + "lr": 2.83090490797546e-06, + "objective/entropy": 207.23558044433594, + "objective/kl": 18.251401901245117, + "objective/non_score_reward": -1.8251402378082275, + "objective/rlhf_reward": -4.90056095123291, + "objective/scores": 0.6, + "policy/approxkl_avg": 26.723499298095703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5192215442657471, + "step": 882, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000243663787842 + }, + { + "episode": 14144, + "epoch": 0.25423302297156414, + "loss/policy_avg": -0.07051656395196915, + "lr": 2.830713190184049e-06, + "objective/entropy": 99.65924072265625, + "objective/kl": 11.526399612426758, + "objective/non_score_reward": -1.1526398658752441, + "objective/rlhf_reward": -4.2105597615242, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.548530101776123, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6270474195480347, + "step": 883, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0014915466308594 + }, + { + "episode": 14160, + "epoch": 0.2545206168889528, + "loss/policy_avg": 0.15961593389511108, + "lr": 2.830521472392638e-06, + "objective/entropy": -13.243667602539062, + "objective/kl": 14.339917182922363, + "objective/non_score_reward": -1.43399178981781, + "objective/rlhf_reward": -1.3359672188758847, + "objective/scores": 1.1, + "policy/approxkl_avg": 100.59516906738281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.570610523223877, + "step": 884, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982887506484985 + }, + { + "episode": 14176, + "epoch": 0.2548082108063415, + "loss/policy_avg": 0.2716251015663147, + "lr": 2.830329754601227e-06, + "objective/entropy": 43.207088470458984, + "objective/kl": 15.224469184875488, + "objective/non_score_reward": -1.522447109222412, + "objective/rlhf_reward": -1.6897883176803585, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.94150161743164, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.45153629779815674, + "step": 885, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001307725906372 + }, + { + "episode": 14192, + "epoch": 0.2550958047237301, + "loss/policy_avg": 0.19296292960643768, + "lr": 2.830138036809816e-06, + "objective/entropy": -110.1358642578125, + "objective/kl": 8.883868217468262, + "objective/non_score_reward": -0.8883869051933289, + "objective/rlhf_reward": -5.553547382354736, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.389582633972168, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.713127613067627, + "step": 886, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0009400844573975 + }, + { + "episode": 14208, + "epoch": 0.25538339864111875, + "loss/policy_avg": -0.08597195148468018, + "lr": 2.829946319018405e-06, + "objective/entropy": 135.94149780273438, + "objective/kl": 12.527217864990234, + "objective/non_score_reward": -1.2527216672897339, + "objective/rlhf_reward": -0.6108868777751919, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.5981717109680176, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6131302714347839, + "step": 887, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0009326934814453 + }, + { + "episode": 14224, + "epoch": 0.2556709925585074, + "loss/policy_avg": 0.06909796595573425, + "lr": 2.8297546012269938e-06, + "objective/entropy": 86.01097869873047, + "objective/kl": 10.155022621154785, + "objective/non_score_reward": -1.0155022144317627, + "objective/rlhf_reward": -1.1382899030458655, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 16.276836395263672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6549547910690308, + "step": 888, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995278120040894 + }, + { + "episode": 14240, + "epoch": 0.25595858647589603, + "loss/policy_avg": 0.1944853961467743, + "lr": 2.829562883435583e-06, + "objective/entropy": 23.008251190185547, + "objective/kl": 10.702659606933594, + "objective/non_score_reward": -1.0702658891677856, + "objective/rlhf_reward": -3.88106365352869, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.89102935791016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.557334840297699, + "step": 889, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0014572143554688 + }, + { + "episode": 14256, + "epoch": 0.2562461803932847, + "loss/policy_avg": 0.18296848237514496, + "lr": 2.829371165644172e-06, + "objective/entropy": 236.90658569335938, + "objective/kl": 17.1822566986084, + "objective/non_score_reward": -1.7182257175445557, + "objective/rlhf_reward": -6.472903227806091, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.678916931152344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6069942116737366, + "step": 890, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001883506774902 + }, + { + "episode": 14272, + "epoch": 0.2565337743106733, + "loss/policy_avg": 0.04921949282288551, + "lr": 2.829179447852761e-06, + "objective/entropy": 108.23074340820312, + "objective/kl": 9.07606315612793, + "objective/non_score_reward": -0.907606303691864, + "objective/rlhf_reward": 0.7695747852325443, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.1709768772125244, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6781247854232788, + "step": 891, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007190704345703 + }, + { + "episode": 14288, + "epoch": 0.25682136822806195, + "loss/policy_avg": 0.263433575630188, + "lr": 2.82898773006135e-06, + "objective/entropy": -11.065872192382812, + "objective/kl": 15.533244132995605, + "objective/non_score_reward": -1.5533244609832764, + "objective/rlhf_reward": -4.090591879860435, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 90.50593566894531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6565124988555908, + "step": 892, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996107816696167 + }, + { + "episode": 14304, + "epoch": 0.25710896214545065, + "loss/policy_avg": 0.03790378198027611, + "lr": 2.8287960122699387e-06, + "objective/entropy": -195.91549682617188, + "objective/kl": 4.9871110916137695, + "objective/non_score_reward": -0.49871110916137695, + "objective/rlhf_reward": -0.047433326916630936, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.2991867065429688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5137837529182434, + "step": 893, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9998875856399536 + }, + { + "episode": 14320, + "epoch": 0.2573965560628393, + "loss/policy_avg": 0.16121526062488556, + "lr": 2.828604294478528e-06, + "objective/entropy": -126.01262664794922, + "objective/kl": 13.482461929321289, + "objective/non_score_reward": -1.3482462167739868, + "objective/rlhf_reward": -3.5681560590592136, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 6.479033470153809, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5413044691085815, + "step": 894, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979188442230225 + }, + { + "episode": 14336, + "epoch": 0.25768414998022793, + "loss/policy_avg": 0.03230078145861626, + "lr": 2.8284125766871167e-06, + "objective/entropy": -3.0018844604492188, + "objective/kl": 11.91611385345459, + "objective/non_score_reward": -1.191611409187317, + "objective/rlhf_reward": -4.366445696353912, + "objective/scores": 0.1, + "policy/approxkl_avg": 78.21747589111328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6705281734466553, + "step": 895, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984080791473389 + }, + { + "episode": 14352, + "epoch": 0.25797174389761657, + "loss/policy_avg": 0.3060767948627472, + "lr": 2.8282208588957055e-06, + "objective/entropy": 61.28870391845703, + "objective/kl": 8.302905082702637, + "objective/non_score_reward": -0.8302905559539795, + "objective/rlhf_reward": -2.921162268519401, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.191162586212158, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.553315281867981, + "step": 896, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997650384902954 + }, + { + "episode": 14368, + "epoch": 0.2582593378150052, + "loss/policy_avg": 0.17983271181583405, + "lr": 2.8280291411042947e-06, + "objective/entropy": -20.331504821777344, + "objective/kl": 17.491222381591797, + "objective/non_score_reward": -1.7491222620010376, + "objective/rlhf_reward": -8.996489524841309, + "objective/scores": -0.5, + "policy/approxkl_avg": 119.10844421386719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5886894464492798, + "step": 897, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989356994628906 + }, + { + "episode": 14384, + "epoch": 0.25854693173239385, + "loss/policy_avg": 0.1394512951374054, + "lr": 2.8278374233128836e-06, + "objective/entropy": 127.6997299194336, + "objective/kl": 15.690263748168945, + "objective/non_score_reward": -1.5690264701843262, + "objective/rlhf_reward": -8.276105880737305, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.133121490478516, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6130183935165405, + "step": 898, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997514247894287 + }, + { + "episode": 14400, + "epoch": 0.2588345256497825, + "loss/policy_avg": 0.2098160684108734, + "lr": 2.827645705521473e-06, + "objective/entropy": -21.335540771484375, + "objective/kl": 10.539255142211914, + "objective/non_score_reward": -1.0539255142211914, + "objective/rlhf_reward": -3.815702205896377, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.069839477539062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7244166731834412, + "step": 899, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994945526123047 + }, + { + "episode": 14416, + "epoch": 0.2591221195671711, + "loss/policy_avg": 0.642552375793457, + "lr": 2.8274539877300616e-06, + "objective/entropy": -27.118850708007812, + "objective/kl": 15.130813598632812, + "objective/non_score_reward": -1.5130811929702759, + "objective/rlhf_reward": -3.652324831485748, + "objective/scores": 0.6, + "policy/approxkl_avg": 107.5416259765625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6859394311904907, + "step": 900, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00144100189209 + }, + { + "episode": 14432, + "epoch": 0.2594097134845598, + "loss/policy_avg": 0.5634386539459229, + "lr": 2.8272622699386504e-06, + "objective/entropy": -81.20954895019531, + "objective/kl": 9.424870491027832, + "objective/non_score_reward": -0.9424870610237122, + "objective/rlhf_reward": -1.647241981998954, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 161.22662353515625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49134987592697144, + "step": 901, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995436668395996 + }, + { + "episode": 14448, + "epoch": 0.25969730740194846, + "loss/policy_avg": 0.12854906916618347, + "lr": 2.8270705521472392e-06, + "objective/entropy": 49.876365661621094, + "objective/kl": 13.830504417419434, + "objective/non_score_reward": -1.383050560951233, + "objective/rlhf_reward": -7.532202243804932, + "objective/scores": -0.5, + "policy/approxkl_avg": 109.16975402832031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6257975697517395, + "step": 902, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994562864303589 + }, + { + "episode": 14464, + "epoch": 0.2599849013193371, + "loss/policy_avg": -0.2561631202697754, + "lr": 2.826878834355828e-06, + "objective/entropy": -208.84307861328125, + "objective/kl": 16.52663803100586, + "objective/non_score_reward": -1.6526635885238647, + "objective/rlhf_reward": -4.210654458403587, + "objective/scores": 0.6, + "policy/approxkl_avg": 27.665565490722656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7057524919509888, + "step": 903, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998441457748413 + }, + { + "episode": 14480, + "epoch": 0.26027249523672574, + "loss/policy_avg": 0.011652922257781029, + "lr": 2.8266871165644173e-06, + "objective/entropy": 39.806941986083984, + "objective/kl": 8.505756378173828, + "objective/non_score_reward": -0.8505756855010986, + "objective/rlhf_reward": -3.0023026227951046, + "objective/scores": 0.1, + "policy/approxkl_avg": 97.56808471679688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49984896183013916, + "step": 904, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997618198394775 + }, + { + "episode": 14496, + "epoch": 0.2605600891541144, + "loss/policy_avg": 0.3299306035041809, + "lr": 2.826495398773006e-06, + "objective/entropy": 186.02059936523438, + "objective/kl": 12.209084510803223, + "objective/non_score_reward": -1.2209084033966064, + "objective/rlhf_reward": -2.760927232281242, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 42.59535217285156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8104973435401917, + "step": 905, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011796951293945 + }, + { + "episode": 14512, + "epoch": 0.260847683071503, + "loss/policy_avg": 0.4473887085914612, + "lr": 2.826303680981595e-06, + "objective/entropy": 236.0746307373047, + "objective/kl": 15.661829948425293, + "objective/non_score_reward": -1.5661829710006714, + "objective/rlhf_reward": -8.264732360839844, + "objective/scores": -0.5, + "policy/approxkl_avg": 174.1883087158203, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6012299656867981, + "step": 906, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999688982963562 + }, + { + "episode": 14528, + "epoch": 0.26113527698889166, + "loss/policy_avg": 0.26635417342185974, + "lr": 2.826111963190184e-06, + "objective/entropy": -117.22915649414062, + "objective/kl": 19.672523498535156, + "objective/non_score_reward": -1.967252492904663, + "objective/rlhf_reward": -9.869009971618652, + "objective/scores": -0.5, + "policy/approxkl_avg": 142.4681396484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6905121803283691, + "step": 907, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998607873916626 + }, + { + "episode": 14544, + "epoch": 0.26142287090628036, + "loss/policy_avg": 0.463941752910614, + "lr": 2.825920245398773e-06, + "objective/entropy": 193.2610626220703, + "objective/kl": 11.647161483764648, + "objective/non_score_reward": -1.1647162437438965, + "objective/rlhf_reward": -2.2588648259639736, + "objective/scores": 0.6, + "policy/approxkl_avg": 40.81477355957031, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6915194988250732, + "step": 908, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976000785827637 + }, + { + "episode": 14560, + "epoch": 0.261710464823669, + "loss/policy_avg": 0.4663980007171631, + "lr": 2.825728527607362e-06, + "objective/entropy": -9.778663635253906, + "objective/kl": 11.958295822143555, + "objective/non_score_reward": -1.1958296298980713, + "objective/rlhf_reward": -1.8595997437250344, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.867328643798828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.38230326771736145, + "step": 909, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996697902679443 + }, + { + "episode": 14576, + "epoch": 0.26199805874105764, + "loss/policy_avg": 0.21209433674812317, + "lr": 2.825536809815951e-06, + "objective/entropy": 162.04637145996094, + "objective/kl": 15.277240753173828, + "objective/non_score_reward": -1.527724266052246, + "objective/rlhf_reward": -5.710896825790405, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.56328010559082, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7583177089691162, + "step": 910, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000117063522339 + }, + { + "episode": 14592, + "epoch": 0.2622856526584463, + "loss/policy_avg": 0.6738499402999878, + "lr": 2.8253450920245398e-06, + "objective/entropy": 169.78570556640625, + "objective/kl": 12.763500213623047, + "objective/non_score_reward": -1.2763500213623047, + "objective/rlhf_reward": -4.705400294065475, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.89369201660156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.712113082408905, + "step": 911, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986307621002197 + }, + { + "episode": 14608, + "epoch": 0.2625732465758349, + "loss/policy_avg": -0.1670105904340744, + "lr": 2.825153374233129e-06, + "objective/entropy": 129.38778686523438, + "objective/kl": 8.210895538330078, + "objective/non_score_reward": -0.8210896253585815, + "objective/rlhf_reward": -5.284358501434326, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.988738536834717, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5074647665023804, + "step": 912, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003474473953247 + }, + { + "episode": 14624, + "epoch": 0.26286084049322356, + "loss/policy_avg": 0.8385502099990845, + "lr": 2.824961656441718e-06, + "objective/entropy": 2.270915985107422, + "objective/kl": 16.01020622253418, + "objective/non_score_reward": -1.6010206937789917, + "objective/rlhf_reward": -6.004082834720611, + "objective/scores": 0.1, + "policy/approxkl_avg": 122.61514282226562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6589502692222595, + "step": 913, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001535415649414 + }, + { + "episode": 14640, + "epoch": 0.2631484344106122, + "loss/policy_avg": 0.5200643539428711, + "lr": 2.824769938650307e-06, + "objective/entropy": 41.12443542480469, + "objective/kl": 14.1666259765625, + "objective/non_score_reward": -1.4166628122329712, + "objective/rlhf_reward": -1.2666512787342068, + "objective/scores": 1.1, + "policy/approxkl_avg": 110.20585632324219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7196406126022339, + "step": 914, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971590042114258 + }, + { + "episode": 14656, + "epoch": 0.26343602832800084, + "loss/policy_avg": 0.056996263563632965, + "lr": 2.824578220858896e-06, + "objective/entropy": 147.31983947753906, + "objective/kl": 7.540309906005859, + "objective/non_score_reward": -0.7540310025215149, + "objective/rlhf_reward": -5.0161237716674805, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.796720504760742, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5611803531646729, + "step": 915, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968600273132324 + }, + { + "episode": 14672, + "epoch": 0.26372362224538953, + "loss/policy_avg": 0.24829980731010437, + "lr": 2.8243865030674847e-06, + "objective/entropy": 31.06182861328125, + "objective/kl": 10.875221252441406, + "objective/non_score_reward": -1.0875221490859985, + "objective/rlhf_reward": -2.227382304445777, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.782899379730225, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.635535717010498, + "step": 916, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9965479373931885 + }, + { + "episode": 14688, + "epoch": 0.2640112161627782, + "loss/policy_avg": 0.06934709846973419, + "lr": 2.824194785276074e-06, + "objective/entropy": 142.6950225830078, + "objective/kl": 6.470335960388184, + "objective/non_score_reward": -0.6470335721969604, + "objective/rlhf_reward": 0.3355845019805703, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.148372650146484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6092413663864136, + "step": 917, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998875617980957 + }, + { + "episode": 14704, + "epoch": 0.2642988100801668, + "loss/policy_avg": 0.20597806572914124, + "lr": 2.8240030674846627e-06, + "objective/entropy": 272.4873046875, + "objective/kl": 14.78536319732666, + "objective/non_score_reward": -1.4785361289978027, + "objective/rlhf_reward": -7.914144992828369, + "objective/scores": -0.5, + "policy/approxkl_avg": 106.7894287109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7212300300598145, + "step": 918, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0013232231140137 + }, + { + "episode": 14720, + "epoch": 0.26458640399755545, + "loss/policy_avg": -0.0837036743760109, + "lr": 2.8238113496932515e-06, + "objective/entropy": 48.787288665771484, + "objective/kl": 13.830099105834961, + "objective/non_score_reward": -1.383009910583496, + "objective/rlhf_reward": -7.532039165496826, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.889322280883789, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6313380599021912, + "step": 919, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004570484161377 + }, + { + "episode": 14736, + "epoch": 0.2648739979149441, + "loss/policy_avg": 0.5091476440429688, + "lr": 2.8236196319018408e-06, + "objective/entropy": 169.26397705078125, + "objective/kl": 15.42192554473877, + "objective/non_score_reward": -1.5421926975250244, + "objective/rlhf_reward": -5.768770357966423, + "objective/scores": 0.1, + "policy/approxkl_avg": 256.7361145019531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6787434816360474, + "step": 920, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991068840026855 + }, + { + "episode": 14752, + "epoch": 0.26516159183233273, + "loss/policy_avg": 0.153235524892807, + "lr": 2.8234279141104296e-06, + "objective/entropy": -151.74813842773438, + "objective/kl": 8.891946792602539, + "objective/non_score_reward": -0.8891947269439697, + "objective/rlhf_reward": 0.8432209208607677, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.438941955566406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6721331477165222, + "step": 921, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993393421173096 + }, + { + "episode": 14768, + "epoch": 0.2654491857497214, + "loss/policy_avg": 0.1505753993988037, + "lr": 2.823236196319019e-06, + "objective/entropy": 103.85417175292969, + "objective/kl": 9.798246383666992, + "objective/non_score_reward": -0.9798246026039124, + "objective/rlhf_reward": -2.363039045539453, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 3.5156641006469727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5437833070755005, + "step": 922, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987695217132568 + }, + { + "episode": 14784, + "epoch": 0.26573677966711007, + "loss/policy_avg": 0.031007010489702225, + "lr": 2.823044478527607e-06, + "objective/entropy": -236.375732421875, + "objective/kl": 14.377126693725586, + "objective/non_score_reward": -1.4377126693725586, + "objective/rlhf_reward": -5.350850439071655, + "objective/scores": 0.1, + "policy/approxkl_avg": 107.49105072021484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7668004631996155, + "step": 923, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992296695709229 + }, + { + "episode": 14800, + "epoch": 0.2660243735844987, + "loss/policy_avg": 1.054640769958496, + "lr": 2.8228527607361964e-06, + "objective/entropy": 308.00750732421875, + "objective/kl": 16.157392501831055, + "objective/non_score_reward": -1.615739345550537, + "objective/rlhf_reward": -3.539238218904707, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 54.42896270751953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7293402552604675, + "step": 924, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982367753982544 + }, + { + "episode": 14816, + "epoch": 0.26631196750188735, + "loss/policy_avg": 0.8975625038146973, + "lr": 2.8226610429447852e-06, + "objective/entropy": 24.863666534423828, + "objective/kl": 9.843988418579102, + "objective/non_score_reward": -0.9843988418579102, + "objective/rlhf_reward": -3.5375953972339627, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.406509399414062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.34417903423309326, + "step": 925, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979350566864014 + }, + { + "episode": 14832, + "epoch": 0.266599561419276, + "loss/policy_avg": 0.7326955795288086, + "lr": 2.822469325153374e-06, + "objective/entropy": -177.4724578857422, + "objective/kl": 18.637279510498047, + "objective/non_score_reward": -1.8637280464172363, + "objective/rlhf_reward": -5.850791875187474, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 165.49850463867188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6023860573768616, + "step": 926, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989757537841797 + }, + { + "episode": 14848, + "epoch": 0.2668871553366646, + "loss/policy_avg": 0.18156574666500092, + "lr": 2.8222776073619633e-06, + "objective/entropy": 98.12123107910156, + "objective/kl": 16.134002685546875, + "objective/non_score_reward": -1.6134004592895508, + "objective/rlhf_reward": -8.453601837158203, + "objective/scores": -0.5, + "policy/approxkl_avg": 92.71282958984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4930706322193146, + "step": 927, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999679446220398 + }, + { + "episode": 14864, + "epoch": 0.26717474925405327, + "loss/policy_avg": 0.6215415596961975, + "lr": 2.822085889570552e-06, + "objective/entropy": -99.99117279052734, + "objective/kl": 6.985932350158691, + "objective/non_score_reward": -0.6985931396484375, + "objective/rlhf_reward": 1.6056274041533474, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.1479220390319824, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5162912011146545, + "step": 928, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0008466243743896 + }, + { + "episode": 14880, + "epoch": 0.2674623431714419, + "loss/policy_avg": 0.305203914642334, + "lr": 2.8218941717791413e-06, + "objective/entropy": 52.752532958984375, + "objective/kl": 16.012250900268555, + "objective/non_score_reward": -1.6012248992919922, + "objective/rlhf_reward": -6.0048998355865475, + "objective/scores": 0.1, + "policy/approxkl_avg": 33.087364196777344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8244567513465881, + "step": 929, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9968031644821167 + }, + { + "episode": 14896, + "epoch": 0.26774993708883055, + "loss/policy_avg": 0.3882259726524353, + "lr": 2.82170245398773e-06, + "objective/entropy": 170.99806213378906, + "objective/kl": 11.023755073547363, + "objective/non_score_reward": -1.1023752689361572, + "objective/rlhf_reward": -0.009501358866691234, + "objective/scores": 1.1, + "policy/approxkl_avg": 47.82780456542969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68819260597229, + "step": 930, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000180721282959 + }, + { + "episode": 14912, + "epoch": 0.26803753100621924, + "loss/policy_avg": 0.3398761451244354, + "lr": 2.821510736196319e-06, + "objective/entropy": -18.394920349121094, + "objective/kl": 10.919998168945312, + "objective/non_score_reward": -1.0919996500015259, + "objective/rlhf_reward": -1.9679986894130705, + "objective/scores": 0.6, + "policy/approxkl_avg": 50.97767639160156, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.736298680305481, + "step": 931, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992265701293945 + }, + { + "episode": 14928, + "epoch": 0.2683251249236079, + "loss/policy_avg": -0.4279845058917999, + "lr": 2.821319018404908e-06, + "objective/entropy": 86.6572494506836, + "objective/kl": 7.83455228805542, + "objective/non_score_reward": -0.7834553718566895, + "objective/rlhf_reward": -1.4719620398884876, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 18.167985916137695, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.49258583784103394, + "step": 932, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.011922836303711 + }, + { + "episode": 14944, + "epoch": 0.2686127188409965, + "loss/policy_avg": 0.23957113921642303, + "lr": 2.821127300613497e-06, + "objective/entropy": 121.09419250488281, + "objective/kl": 11.997109413146973, + "objective/non_score_reward": -1.1997110843658447, + "objective/rlhf_reward": -6.798844337463379, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.186599731445312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5409206748008728, + "step": 933, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.995774269104004 + }, + { + "episode": 14960, + "epoch": 0.26890031275838516, + "loss/policy_avg": 0.44504815340042114, + "lr": 2.8209355828220858e-06, + "objective/entropy": 85.28959655761719, + "objective/kl": 9.80569076538086, + "objective/non_score_reward": -0.9805691242218018, + "objective/rlhf_reward": 0.47772344350814855, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.007376670837402, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6362345218658447, + "step": 934, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998863697052002 + }, + { + "episode": 14976, + "epoch": 0.2691879066757738, + "loss/policy_avg": 0.06488144397735596, + "lr": 2.820743865030675e-06, + "objective/entropy": -71.18653869628906, + "objective/kl": 11.959724426269531, + "objective/non_score_reward": -1.1959723234176636, + "objective/rlhf_reward": -6.783889293670654, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.0035400390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6700109243392944, + "step": 935, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0012855529785156 + }, + { + "episode": 14992, + "epoch": 0.26947550059316244, + "loss/policy_avg": 0.00518820621073246, + "lr": 2.820552147239264e-06, + "objective/entropy": 10.628082275390625, + "objective/kl": 19.038652420043945, + "objective/non_score_reward": -1.9038654565811157, + "objective/rlhf_reward": -4.691742573620054, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 43.91889190673828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9487606287002563, + "step": 936, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997607707977295 + }, + { + "episode": 15008, + "epoch": 0.2697630945105511, + "loss/policy_avg": 0.1454022079706192, + "lr": 2.820360429447853e-06, + "objective/entropy": 179.77040100097656, + "objective/kl": 12.530265808105469, + "objective/non_score_reward": -1.2530266046524048, + "objective/rlhf_reward": -4.612106418609619, + "objective/scores": 0.1, + "policy/approxkl_avg": 82.92576599121094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6685892343521118, + "step": 937, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975636005401611 + }, + { + "episode": 15024, + "epoch": 0.2700506884279397, + "loss/policy_avg": 0.2995688319206238, + "lr": 2.820168711656442e-06, + "objective/entropy": 133.7378692626953, + "objective/kl": 10.791794776916504, + "objective/non_score_reward": -1.0791795253753662, + "objective/rlhf_reward": -1.9167180418968202, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.06595230102539, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4817456007003784, + "step": 938, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000180721282959 + }, + { + "episode": 15040, + "epoch": 0.2703382823453284, + "loss/policy_avg": 0.34660422801971436, + "lr": 2.8199769938650307e-06, + "objective/entropy": 19.058616638183594, + "objective/kl": 14.038864135742188, + "objective/non_score_reward": -1.4038866758346558, + "objective/rlhf_reward": -1.215546733140945, + "objective/scores": 1.1, + "policy/approxkl_avg": 48.796592712402344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41182541847229004, + "step": 939, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997514009475708 + }, + { + "episode": 15056, + "epoch": 0.27062587626271706, + "loss/policy_avg": 0.2968835234642029, + "lr": 2.81978527607362e-06, + "objective/entropy": 76.23985290527344, + "objective/kl": 13.028081893920898, + "objective/non_score_reward": -1.3028082847595215, + "objective/rlhf_reward": -4.811233407258987, + "objective/scores": 0.1, + "policy/approxkl_avg": 115.7067642211914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6411737203598022, + "step": 940, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997145414352417 + }, + { + "episode": 15072, + "epoch": 0.2709134701801057, + "loss/policy_avg": 0.38955211639404297, + "lr": 2.8195935582822087e-06, + "objective/entropy": -10.788116455078125, + "objective/kl": 14.27804946899414, + "objective/non_score_reward": -1.4278050661087036, + "objective/rlhf_reward": -1.3112202048301693, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.869421005249023, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6856441497802734, + "step": 941, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979138374328613 + }, + { + "episode": 15088, + "epoch": 0.27120106409749434, + "loss/policy_avg": -0.04374265670776367, + "lr": 2.819401840490798e-06, + "objective/entropy": 46.83854675292969, + "objective/kl": 8.191838264465332, + "objective/non_score_reward": -0.8191839456558228, + "objective/rlhf_reward": -2.8767357528209683, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.673277854919434, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4365406632423401, + "step": 942, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992234706878662 + }, + { + "episode": 15104, + "epoch": 0.271488658014883, + "loss/policy_avg": 0.019855128601193428, + "lr": 2.8192101226993868e-06, + "objective/entropy": 146.18865966796875, + "objective/kl": 11.691083908081055, + "objective/non_score_reward": -1.169108271598816, + "objective/rlhf_reward": -4.276433093845844, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.19375228881836, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.438481867313385, + "step": 943, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008602142333984 + }, + { + "episode": 15120, + "epoch": 0.2717762519322716, + "loss/policy_avg": -0.07478684931993484, + "lr": 2.8190184049079756e-06, + "objective/entropy": 203.56137084960938, + "objective/kl": 14.393468856811523, + "objective/non_score_reward": -1.4393467903137207, + "objective/rlhf_reward": -1.357387429475784, + "objective/scores": 1.1, + "policy/approxkl_avg": 32.7160758972168, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7971817255020142, + "step": 944, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00042462348938 + }, + { + "episode": 15136, + "epoch": 0.27206384584966026, + "loss/policy_avg": 0.2446022629737854, + "lr": 2.8188266871165644e-06, + "objective/entropy": 31.180438995361328, + "objective/kl": 16.74203109741211, + "objective/non_score_reward": -1.6742032766342163, + "objective/rlhf_reward": -8.696813583374023, + "objective/scores": -0.5, + "policy/approxkl_avg": 105.12753295898438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7867506742477417, + "step": 945, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9979796409606934 + }, + { + "episode": 15152, + "epoch": 0.27235143976704895, + "loss/policy_avg": 0.1629078984260559, + "lr": 2.818634969325153e-06, + "objective/entropy": -197.06622314453125, + "objective/kl": 13.794787406921387, + "objective/non_score_reward": -1.3794788122177124, + "objective/rlhf_reward": -7.51791524887085, + "objective/scores": -0.5, + "policy/approxkl_avg": 115.64588928222656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6810861825942993, + "step": 946, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9969407320022583 + }, + { + "episode": 15168, + "epoch": 0.2726390336844376, + "loss/policy_avg": -0.03024168312549591, + "lr": 2.8184432515337424e-06, + "objective/entropy": 149.4989776611328, + "objective/kl": 16.816232681274414, + "objective/non_score_reward": -1.6816232204437256, + "objective/rlhf_reward": -8.726492881774902, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.17455291748047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48948463797569275, + "step": 947, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001532554626465 + }, + { + "episode": 15184, + "epoch": 0.27292662760182623, + "loss/policy_avg": 0.34567373991012573, + "lr": 2.8182515337423312e-06, + "objective/entropy": 90.28936767578125, + "objective/kl": 19.398473739624023, + "objective/non_score_reward": -1.939847469329834, + "objective/rlhf_reward": -7.359389877319336, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.141029357910156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7335373163223267, + "step": 948, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995067119598389 + }, + { + "episode": 15200, + "epoch": 0.2732142215192149, + "loss/policy_avg": 0.4489978551864624, + "lr": 2.81805981595092e-06, + "objective/entropy": 38.101829528808594, + "objective/kl": 14.68608283996582, + "objective/non_score_reward": -1.4686082601547241, + "objective/rlhf_reward": -7.8744330406188965, + "objective/scores": -0.5, + "policy/approxkl_avg": 115.49784088134766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5973610877990723, + "step": 949, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989502429962158 + }, + { + "episode": 15216, + "epoch": 0.2735018154366035, + "loss/policy_avg": 0.4765210449695587, + "lr": 2.8178680981595093e-06, + "objective/entropy": 234.66183471679688, + "objective/kl": 17.86334228515625, + "objective/non_score_reward": -1.7863342761993408, + "objective/rlhf_reward": -6.745336925983429, + "objective/scores": 0.1, + "policy/approxkl_avg": 46.356414794921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7054615616798401, + "step": 950, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985949993133545 + }, + { + "episode": 15232, + "epoch": 0.27378940935399215, + "loss/policy_avg": 0.0288299061357975, + "lr": 2.817676380368098e-06, + "objective/entropy": 156.8824462890625, + "objective/kl": 9.917464256286621, + "objective/non_score_reward": -0.9917463660240173, + "objective/rlhf_reward": -5.966985702514648, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.51423645019531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5643774271011353, + "step": 951, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000211238861084 + }, + { + "episode": 15248, + "epoch": 0.2740770032713808, + "loss/policy_avg": 0.11630159616470337, + "lr": 2.8174846625766873e-06, + "objective/entropy": 73.06771087646484, + "objective/kl": 13.975484848022461, + "objective/non_score_reward": -1.3975484371185303, + "objective/rlhf_reward": -5.190193688869476, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.41292190551758, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5209547281265259, + "step": 952, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003228187561035 + }, + { + "episode": 15264, + "epoch": 0.27436459718876943, + "loss/policy_avg": 0.4185836911201477, + "lr": 2.817292944785276e-06, + "objective/entropy": 172.16983032226562, + "objective/kl": 18.557403564453125, + "objective/non_score_reward": -1.8557404279708862, + "objective/rlhf_reward": -7.022961831092834, + "objective/scores": 0.1, + "policy/approxkl_avg": 90.51202392578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.751570463180542, + "step": 953, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9965605735778809 + }, + { + "episode": 15280, + "epoch": 0.27465219110615813, + "loss/policy_avg": -0.0021925121545791626, + "lr": 2.817101226993865e-06, + "objective/entropy": -46.05889892578125, + "objective/kl": 13.408881187438965, + "objective/non_score_reward": -1.3408881425857544, + "objective/rlhf_reward": -2.439833794475767, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 17.623706817626953, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7430280447006226, + "step": 954, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016512870788574 + }, + { + "episode": 15296, + "epoch": 0.27493978502354677, + "loss/policy_avg": -0.014723315834999084, + "lr": 2.816909509202454e-06, + "objective/entropy": 288.378173828125, + "objective/kl": 11.871005058288574, + "objective/non_score_reward": -1.1871004104614258, + "objective/rlhf_reward": -6.748402118682861, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.3176779747009277, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6996443271636963, + "step": 955, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000519037246704 + }, + { + "episode": 15312, + "epoch": 0.2752273789409354, + "loss/policy_avg": 0.6079083681106567, + "lr": 2.816717791411043e-06, + "objective/entropy": 31.44274139404297, + "objective/kl": 20.894535064697266, + "objective/non_score_reward": -2.08945369720459, + "objective/rlhf_reward": -7.957814311981202, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.899011611938477, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3588119149208069, + "step": 956, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980523586273193 + }, + { + "episode": 15328, + "epoch": 0.27551497285832405, + "loss/policy_avg": 0.41200828552246094, + "lr": 2.816526073619632e-06, + "objective/entropy": -4.771537780761719, + "objective/kl": 15.709589004516602, + "objective/non_score_reward": -1.5709590911865234, + "objective/rlhf_reward": -4.161129685417686, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 90.92428588867188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5645814538002014, + "step": 957, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.995368242263794 + }, + { + "episode": 15344, + "epoch": 0.2758025667757127, + "loss/policy_avg": 0.45684581995010376, + "lr": 2.816334355828221e-06, + "objective/entropy": -4.6809844970703125, + "objective/kl": 9.97734260559082, + "objective/non_score_reward": -0.9977341890335083, + "objective/rlhf_reward": -3.5909366667270657, + "objective/scores": 0.1, + "policy/approxkl_avg": 56.411354064941406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6887657642364502, + "step": 958, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002199172973633 + }, + { + "episode": 15360, + "epoch": 0.2760901606931013, + "loss/policy_avg": -0.45410293340682983, + "lr": 2.81614263803681e-06, + "objective/entropy": -121.51223754882812, + "objective/kl": 7.737212657928467, + "objective/non_score_reward": -0.7737212777137756, + "objective/rlhf_reward": -1.2700562730160465, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 17.588157653808594, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4520346224308014, + "step": 959, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.006819725036621 + }, + { + "episode": 15376, + "epoch": 0.27637775461048997, + "loss/policy_avg": 0.30216163396835327, + "lr": 2.815950920245399e-06, + "objective/entropy": 151.243896484375, + "objective/kl": 17.64258575439453, + "objective/non_score_reward": -1.764258623123169, + "objective/rlhf_reward": -4.133315478206846, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 200.840087890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5193691253662109, + "step": 960, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9965643882751465 + }, + { + "episode": 15392, + "epoch": 0.27666534852787866, + "loss/policy_avg": 0.04060244560241699, + "lr": 2.815759202453988e-06, + "objective/entropy": 231.11839294433594, + "objective/kl": 14.362661361694336, + "objective/non_score_reward": -1.4362661838531494, + "objective/rlhf_reward": -5.345064407587051, + "objective/scores": 0.1, + "policy/approxkl_avg": 47.88323974609375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7988565564155579, + "step": 961, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993538856506348 + }, + { + "episode": 15408, + "epoch": 0.2769529424452673, + "loss/policy_avg": 0.3130953907966614, + "lr": 2.8155674846625767e-06, + "objective/entropy": 120.5262680053711, + "objective/kl": 15.489297866821289, + "objective/non_score_reward": -1.5489299297332764, + "objective/rlhf_reward": -5.795719510316848, + "objective/scores": 0.1, + "policy/approxkl_avg": 145.9391326904297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6065146923065186, + "step": 962, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990172386169434 + }, + { + "episode": 15424, + "epoch": 0.27724053636265594, + "loss/policy_avg": 0.4497438967227936, + "lr": 2.815375766871166e-06, + "objective/entropy": 103.47833251953125, + "objective/kl": 18.042163848876953, + "objective/non_score_reward": -1.8042165040969849, + "objective/rlhf_reward": -9.216865539550781, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.347259521484375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5049797296524048, + "step": 963, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005576610565186 + }, + { + "episode": 15440, + "epoch": 0.2775281302800446, + "loss/policy_avg": 0.35638368129730225, + "lr": 2.8151840490797547e-06, + "objective/entropy": 91.05720520019531, + "objective/kl": 11.770478248596191, + "objective/non_score_reward": -1.177047848701477, + "objective/rlhf_reward": -1.7844724997293677, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 71.71107482910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8685045838356018, + "step": 964, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999760627746582 + }, + { + "episode": 15456, + "epoch": 0.2778157241974332, + "loss/policy_avg": 0.463620662689209, + "lr": 2.814992331288344e-06, + "objective/entropy": 75.50821685791016, + "objective/kl": 18.123964309692383, + "objective/non_score_reward": -1.8123962879180908, + "objective/rlhf_reward": -5.693326204028681, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 125.98820495605469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6453250050544739, + "step": 965, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984214305877686 + }, + { + "episode": 15472, + "epoch": 0.27810331811482186, + "loss/policy_avg": 0.29372507333755493, + "lr": 2.8148006134969328e-06, + "objective/entropy": 249.2400360107422, + "objective/kl": 21.19771385192871, + "objective/non_score_reward": -2.1197714805603027, + "objective/rlhf_reward": -6.079085892438888, + "objective/scores": 0.6, + "policy/approxkl_avg": 248.2655487060547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8577245473861694, + "step": 966, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999601125717163 + }, + { + "episode": 15488, + "epoch": 0.2783909120322105, + "loss/policy_avg": 0.06916552782058716, + "lr": 2.8146088957055216e-06, + "objective/entropy": -68.20138549804688, + "objective/kl": 7.964428901672363, + "objective/non_score_reward": -0.7964429259300232, + "objective/rlhf_reward": -0.7857717037200929, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.7321834564208984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44984665513038635, + "step": 967, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999240756034851 + }, + { + "episode": 15504, + "epoch": 0.27867850594959914, + "loss/policy_avg": 0.2512040436267853, + "lr": 2.8144171779141104e-06, + "objective/entropy": 11.576576232910156, + "objective/kl": 7.048787593841553, + "objective/non_score_reward": -0.7048788070678711, + "objective/rlhf_reward": -0.9946863904324283, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 0.4532914459705353, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5414634943008423, + "step": 968, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003035545349121 + }, + { + "episode": 15520, + "epoch": 0.27896609986698784, + "loss/policy_avg": 0.6474967002868652, + "lr": 2.814225460122699e-06, + "objective/entropy": -189.89291381835938, + "objective/kl": 6.612698078155518, + "objective/non_score_reward": -0.6612698435783386, + "objective/rlhf_reward": -0.24507925510406492, + "objective/scores": 0.6, + "policy/approxkl_avg": 31.995384216308594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7192500233650208, + "step": 969, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001279830932617 + }, + { + "episode": 15536, + "epoch": 0.2792536937843765, + "loss/policy_avg": 0.03323252499103546, + "lr": 2.8140337423312884e-06, + "objective/entropy": -8.374664306640625, + "objective/kl": 7.630832672119141, + "objective/non_score_reward": -0.7630833387374878, + "objective/rlhf_reward": -0.12861440026876592, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.505527019500732, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7015961408615112, + "step": 970, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000675678253174 + }, + { + "episode": 15552, + "epoch": 0.2795412877017651, + "loss/policy_avg": 0.21652743220329285, + "lr": 2.8138420245398772e-06, + "objective/entropy": 166.86929321289062, + "objective/kl": 10.748126983642578, + "objective/non_score_reward": -1.0748127698898315, + "objective/rlhf_reward": -3.899251019954681, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.194313049316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4750038683414459, + "step": 971, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973280429840088 + }, + { + "episode": 15568, + "epoch": 0.27982888161915376, + "loss/policy_avg": 0.49266356229782104, + "lr": 2.813650306748466e-06, + "objective/entropy": 118.69342041015625, + "objective/kl": 12.942707061767578, + "objective/non_score_reward": -1.2942707538604736, + "objective/rlhf_reward": -3.5152232996827233, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 94.40614318847656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7927144169807434, + "step": 972, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002317428588867 + }, + { + "episode": 15584, + "epoch": 0.2801164755365424, + "loss/policy_avg": -0.02213054895401001, + "lr": 2.8134585889570553e-06, + "objective/entropy": 187.91323852539062, + "objective/kl": 18.41681671142578, + "objective/non_score_reward": -1.841681718826294, + "objective/rlhf_reward": -9.366726875305176, + "objective/scores": -0.5, + "policy/approxkl_avg": 153.68333435058594, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7450737953186035, + "step": 973, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002140760421753 + }, + { + "episode": 15600, + "epoch": 0.28040406945393104, + "loss/policy_avg": -0.016699761152267456, + "lr": 2.813266871165644e-06, + "objective/entropy": 20.254638671875, + "objective/kl": 5.66358757019043, + "objective/non_score_reward": -0.566358745098114, + "objective/rlhf_reward": -4.265435218811035, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.735843658447266, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.626510739326477, + "step": 974, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003321886062622 + }, + { + "episode": 15616, + "epoch": 0.2806916633713197, + "loss/policy_avg": 0.5562024712562561, + "lr": 2.8130751533742333e-06, + "objective/entropy": 83.20606994628906, + "objective/kl": 12.872986793518066, + "objective/non_score_reward": -1.2872986793518066, + "objective/rlhf_reward": -7.149194717407227, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.93476486206055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6668212413787842, + "step": 975, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978232383728027 + }, + { + "episode": 15632, + "epoch": 0.2809792572887083, + "loss/policy_avg": 0.11060173809528351, + "lr": 2.812883435582822e-06, + "objective/entropy": -130.44863891601562, + "objective/kl": 12.669355392456055, + "objective/non_score_reward": -1.2669358253479004, + "objective/rlhf_reward": -2.1440238698732585, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 30.233688354492188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5940582752227783, + "step": 976, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00014328956604 + }, + { + "episode": 15648, + "epoch": 0.281266851206097, + "loss/policy_avg": 0.1823975145816803, + "lr": 2.812691717791411e-06, + "objective/entropy": 94.65828704833984, + "objective/kl": 12.634793281555176, + "objective/non_score_reward": -1.2634793519973755, + "objective/rlhf_reward": -2.130198453308317, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 86.40443420410156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5936030149459839, + "step": 977, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006115436553955 + }, + { + "episode": 15664, + "epoch": 0.28155444512348565, + "loss/policy_avg": 0.2329886257648468, + "lr": 2.8125e-06, + "objective/entropy": 214.52359008789062, + "objective/kl": 14.259061813354492, + "objective/non_score_reward": -1.4259061813354492, + "objective/rlhf_reward": -1.3036247253417965, + "objective/scores": 1.1, + "policy/approxkl_avg": 51.973575592041016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7015668153762817, + "step": 978, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960739612579346 + }, + { + "episode": 15680, + "epoch": 0.2818420390408743, + "loss/policy_avg": 0.1662026047706604, + "lr": 2.812308282208589e-06, + "objective/entropy": 140.4540252685547, + "objective/kl": 18.81513786315918, + "objective/non_score_reward": -1.881514072418213, + "objective/rlhf_reward": -5.578645418362553, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 54.464908599853516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5715263485908508, + "step": 979, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991223812103271 + }, + { + "episode": 15696, + "epoch": 0.28212963295826293, + "loss/policy_avg": 0.5212792754173279, + "lr": 2.8121165644171782e-06, + "objective/entropy": 271.5147705078125, + "objective/kl": 11.757984161376953, + "objective/non_score_reward": -1.1757985353469849, + "objective/rlhf_reward": -2.580488087908302, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 99.44505310058594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6800845265388489, + "step": 980, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972641468048096 + }, + { + "episode": 15712, + "epoch": 0.2824172268756516, + "loss/policy_avg": 0.10810688138008118, + "lr": 2.811924846625767e-06, + "objective/entropy": 220.07534790039062, + "objective/kl": 9.518316268920898, + "objective/non_score_reward": -0.9518316984176636, + "objective/rlhf_reward": -0.8836077495825019, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.45026969909668, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6736278533935547, + "step": 981, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0032143592834473 + }, + { + "episode": 15728, + "epoch": 0.2827048207930402, + "loss/policy_avg": 0.1366291642189026, + "lr": 2.811733128834356e-06, + "objective/entropy": -55.597900390625, + "objective/kl": 9.75802230834961, + "objective/non_score_reward": -0.9758022427558899, + "objective/rlhf_reward": -5.9032087326049805, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.722705841064453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4849565923213959, + "step": 982, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987678527832031 + }, + { + "episode": 15744, + "epoch": 0.28299241471042885, + "loss/policy_avg": 0.29598093032836914, + "lr": 2.811541411042945e-06, + "objective/entropy": 65.98103332519531, + "objective/kl": 8.494199752807617, + "objective/non_score_reward": -0.8494198322296143, + "objective/rlhf_reward": 1.002320730686188, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.425743103027344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.38723868131637573, + "step": 983, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991505146026611 + }, + { + "episode": 15760, + "epoch": 0.28328000862781755, + "loss/policy_avg": 0.44190719723701477, + "lr": 2.811349693251534e-06, + "objective/entropy": -63.0709114074707, + "objective/kl": 13.403665542602539, + "objective/non_score_reward": -1.3403666019439697, + "objective/rlhf_reward": -2.437747453094694, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 51.38234329223633, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.630331814289093, + "step": 984, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976906776428223 + }, + { + "episode": 15776, + "epoch": 0.2835676025452062, + "loss/policy_avg": 0.08249235153198242, + "lr": 2.8111579754601227e-06, + "objective/entropy": -40.74168395996094, + "objective/kl": 17.953168869018555, + "objective/non_score_reward": -1.7953169345855713, + "objective/rlhf_reward": -4.7812679469585415, + "objective/scores": 0.6, + "policy/approxkl_avg": 120.02989196777344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5399852991104126, + "step": 985, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002918243408203 + }, + { + "episode": 15792, + "epoch": 0.2838551964625948, + "loss/policy_avg": 0.2005055695772171, + "lr": 2.810966257668712e-06, + "objective/entropy": -191.1624755859375, + "objective/kl": 9.120893478393555, + "objective/non_score_reward": -0.9120894074440002, + "objective/rlhf_reward": -5.648357391357422, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.947835922241211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.609139084815979, + "step": 986, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988343715667725 + }, + { + "episode": 15808, + "epoch": 0.28414279037998347, + "loss/policy_avg": 0.7648462057113647, + "lr": 2.8107745398773007e-06, + "objective/entropy": 359.6585693359375, + "objective/kl": 18.952003479003906, + "objective/non_score_reward": -1.895200490951538, + "objective/rlhf_reward": -5.458095910326515, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 54.210479736328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8480129837989807, + "step": 987, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999436616897583 + }, + { + "episode": 15824, + "epoch": 0.2844303842973721, + "loss/policy_avg": 0.6691151857376099, + "lr": 2.81058282208589e-06, + "objective/entropy": 73.57808685302734, + "objective/kl": 18.581069946289062, + "objective/non_score_reward": -1.858107089996338, + "objective/rlhf_reward": -5.309722246901069, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 32.08354949951172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4837074875831604, + "step": 988, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994486570358276 + }, + { + "episode": 15840, + "epoch": 0.28471797821476075, + "loss/policy_avg": 0.2605787217617035, + "lr": 2.8103911042944788e-06, + "objective/entropy": -190.95657348632812, + "objective/kl": 11.05725383758545, + "objective/non_score_reward": -1.1057254076004028, + "objective/rlhf_reward": -2.022901570796966, + "objective/scores": 0.6, + "policy/approxkl_avg": 29.785179138183594, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.74052894115448, + "step": 989, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0007266998291016 + }, + { + "episode": 15856, + "epoch": 0.2850055721321494, + "loss/policy_avg": 0.17699474096298218, + "lr": 2.8101993865030676e-06, + "objective/entropy": 202.82345581054688, + "objective/kl": 10.571136474609375, + "objective/non_score_reward": -1.0571134090423584, + "objective/rlhf_reward": -3.8284538000822064, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.342830657958984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5505605340003967, + "step": 990, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986047744750977 + }, + { + "episode": 15872, + "epoch": 0.285293166049538, + "loss/policy_avg": 0.2762322425842285, + "lr": 2.8100076687116564e-06, + "objective/entropy": -215.3885498046875, + "objective/kl": 11.097982406616211, + "objective/non_score_reward": -1.1097981929779053, + "objective/rlhf_reward": -4.039192607998848, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.58441162109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.554857611656189, + "step": 991, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000056743621826 + }, + { + "episode": 15888, + "epoch": 0.2855807599669267, + "loss/policy_avg": 0.8083657026290894, + "lr": 2.809815950920245e-06, + "objective/entropy": 136.85971069335938, + "objective/kl": 14.077508926391602, + "objective/non_score_reward": -1.4077508449554443, + "objective/rlhf_reward": -1.2310034692287442, + "objective/scores": 1.1, + "policy/approxkl_avg": 50.88481903076172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6447016596794128, + "step": 992, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999983310699463 + }, + { + "episode": 15904, + "epoch": 0.28586835388431536, + "loss/policy_avg": 0.09076584875583649, + "lr": 2.8096242331288344e-06, + "objective/entropy": -15.63296127319336, + "objective/kl": 17.013904571533203, + "objective/non_score_reward": -1.701390266418457, + "objective/rlhf_reward": -6.405561363697052, + "objective/scores": 0.1, + "policy/approxkl_avg": 94.94831848144531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5415375828742981, + "step": 993, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992749691009521 + }, + { + "episode": 15920, + "epoch": 0.286155947801704, + "loss/policy_avg": 0.30634552240371704, + "lr": 2.8094325153374232e-06, + "objective/entropy": 146.68658447265625, + "objective/kl": 11.60175895690918, + "objective/non_score_reward": -1.1601760387420654, + "objective/rlhf_reward": -6.640704154968262, + "objective/scores": -0.5, + "policy/approxkl_avg": 34.54106521606445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.577959418296814, + "step": 994, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996502161026001 + }, + { + "episode": 15936, + "epoch": 0.28644354171909264, + "loss/policy_avg": -0.13943040370941162, + "lr": 2.809240797546012e-06, + "objective/entropy": 84.48971557617188, + "objective/kl": 15.685787200927734, + "objective/non_score_reward": -1.5685787200927734, + "objective/rlhf_reward": -4.449486131939004, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 35.03499221801758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4448893070220947, + "step": 995, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0012125968933105 + }, + { + "episode": 15952, + "epoch": 0.2867311356364813, + "loss/policy_avg": 0.5280898213386536, + "lr": 2.8090490797546013e-06, + "objective/entropy": 270.3984375, + "objective/kl": 17.52050018310547, + "objective/non_score_reward": -1.7520501613616943, + "objective/rlhf_reward": -6.608200347423553, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.95287322998047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6844844818115234, + "step": 996, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999497413635254 + }, + { + "episode": 15968, + "epoch": 0.2870187295538699, + "loss/policy_avg": 0.7680552005767822, + "lr": 2.80885736196319e-06, + "objective/entropy": 83.93873596191406, + "objective/kl": 15.629266738891602, + "objective/non_score_reward": -1.5629265308380127, + "objective/rlhf_reward": -5.8517063617706295, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.26701354980469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6365024447441101, + "step": 997, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980711936950684 + }, + { + "episode": 15984, + "epoch": 0.28730632347125856, + "loss/policy_avg": 0.031346116214990616, + "lr": 2.8086656441717793e-06, + "objective/entropy": 55.36757278442383, + "objective/kl": 5.684802055358887, + "objective/non_score_reward": -0.5684801936149597, + "objective/rlhf_reward": -1.873920848965645, + "objective/scores": 0.1, + "policy/approxkl_avg": 0.7184413075447083, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48540106415748596, + "step": 998, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0035548210144043 + }, + { + "episode": 16000, + "epoch": 0.28759391738864726, + "loss/policy_avg": 0.12601952254772186, + "lr": 2.808473926380368e-06, + "objective/entropy": -26.834529876708984, + "objective/kl": 9.361145973205566, + "objective/non_score_reward": -0.9361146688461304, + "objective/rlhf_reward": -0.8207396909010141, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 71.52543640136719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6758915185928345, + "step": 999, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973740577697754 + } + ], + "logging_steps": 500, + "max_steps": 7824, + "num_input_tokens_seen": 0, + "num_train_epochs": 9.0, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": true, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0, + "train_batch_size": null, + "trial_name": null, + "trial_params": null +}