diff --git "a/checkpoint-2600/trainer_state.json" "b/checkpoint-2600/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2600/trainer_state.json" @@ -0,0 +1,46834 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "episode": 41600, + "epoch": 0.7477441852104828, + "eval_steps": 500, + "global_step": 2600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "episode": 16, + "epoch": 0.00028759391738864725, + "loss/policy_avg": -0.014177359640598297, + "lr": 3e-06, + "objective/entropy": 119.65733337402344, + "objective/kl": 15.623376846313477, + "objective/non_score_reward": -1.5623377561569214, + "objective/rlhf_reward": -3.325632084847662, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 472.72821044921875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7515315413475037, + "step": 0, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000829696655273 + }, + { + "episode": 32, + "epoch": 0.0005751878347772945, + "loss/policy_avg": 0.05164449289441109, + "lr": 2.999808282208589e-06, + "objective/entropy": -117.60435485839844, + "objective/kl": 11.686213493347168, + "objective/non_score_reward": -1.168621301651001, + "objective/rlhf_reward": -4.274485094845295, + "objective/scores": 0.1, + "policy/approxkl_avg": 236.72177124023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6307989358901978, + "step": 1, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973212480545044 + }, + { + "episode": 48, + "epoch": 0.0008627817521659417, + "loss/policy_avg": 0.6165977120399475, + "lr": 2.999616564417178e-06, + "objective/entropy": -116.07769775390625, + "objective/kl": 10.806825637817383, + "objective/non_score_reward": -1.080682635307312, + "objective/rlhf_reward": -3.922730395942926, + "objective/scores": 0.1, + "policy/approxkl_avg": 211.7506103515625, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.726571798324585, + "step": 2, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0005264282226562 + }, + { + "episode": 64, + "epoch": 0.001150375669554589, + "loss/policy_avg": 0.39946672320365906, + "lr": 2.999424846625767e-06, + "objective/entropy": -284.77886962890625, + "objective/kl": 9.179925918579102, + "objective/non_score_reward": -0.9179927110671997, + "objective/rlhf_reward": -3.2719709336757656, + "objective/scores": 0.1, + "policy/approxkl_avg": 172.39312744140625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7219442129135132, + "step": 3, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9991655349731445 + }, + { + "episode": 80, + "epoch": 0.001437969586943236, + "loss/policy_avg": 0.18221884965896606, + "lr": 2.999233128834356e-06, + "objective/entropy": -326.7154541015625, + "objective/kl": 10.727872848510742, + "objective/non_score_reward": -1.0727872848510742, + "objective/rlhf_reward": -1.3674301027667253, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 252.6199188232422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5611602067947388, + "step": 4, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9977035522460938 + }, + { + "episode": 96, + "epoch": 0.0017255635043318834, + "loss/policy_avg": 0.37348473072052, + "lr": 2.999041411042945e-06, + "objective/entropy": -172.4725341796875, + "objective/kl": 9.580272674560547, + "objective/non_score_reward": -0.958027184009552, + "objective/rlhf_reward": -3.43210876584053, + "objective/scores": 0.1, + "policy/approxkl_avg": 233.60519409179688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6069347858428955, + "step": 5, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.004929304122925 + }, + { + "episode": 112, + "epoch": 0.0020131574217205307, + "loss/policy_avg": 0.5359442234039307, + "lr": 2.9988496932515338e-06, + "objective/entropy": 37.751182556152344, + "objective/kl": 8.995965957641602, + "objective/non_score_reward": -0.8995967507362366, + "objective/rlhf_reward": -1.6509756696986513, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 157.80946350097656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.45883142948150635, + "step": 6, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981789588928223 + }, + { + "episode": 128, + "epoch": 0.002300751339109178, + "loss/policy_avg": 0.07628901302814484, + "lr": 2.998657975460123e-06, + "objective/entropy": -271.4947509765625, + "objective/kl": 9.241050720214844, + "objective/non_score_reward": -0.9241052865982056, + "objective/rlhf_reward": 0.7035788685083393, + "objective/scores": 1.1, + "policy/approxkl_avg": 179.53875732421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6910897493362427, + "step": 7, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9963037967681885 + }, + { + "episode": 144, + "epoch": 0.002588345256497825, + "loss/policy_avg": 0.0354180671274662, + "lr": 2.998466257668712e-06, + "objective/entropy": 209.80404663085938, + "objective/kl": 11.208139419555664, + "objective/non_score_reward": -1.1208139657974243, + "objective/rlhf_reward": -2.8213962368374927, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 217.8009033203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6648087501525879, + "step": 8, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9964534044265747 + }, + { + "episode": 160, + "epoch": 0.002875939173886472, + "loss/policy_avg": 0.24756430089473724, + "lr": 2.9982745398773006e-06, + "objective/entropy": -5.9293365478515625, + "objective/kl": 1.9302005767822266, + "objective/non_score_reward": -0.1930200457572937, + "objective/rlhf_reward": 1.3506260157367849, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 22.118091583251953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6500340700149536, + "step": 9, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002328395843506 + }, + { + "episode": 176, + "epoch": 0.0031635330912751195, + "loss/policy_avg": 0.22338274121284485, + "lr": 2.99808282208589e-06, + "objective/entropy": -51.18250274658203, + "objective/kl": 4.893694877624512, + "objective/non_score_reward": -0.48936957120895386, + "objective/rlhf_reward": -3.9574780464172363, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.685855865478516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6514552235603333, + "step": 10, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002238750457764 + }, + { + "episode": 192, + "epoch": 0.0034511270086637668, + "loss/policy_avg": 0.07259142398834229, + "lr": 2.9978911042944787e-06, + "objective/entropy": -35.05317306518555, + "objective/kl": 7.698199272155762, + "objective/non_score_reward": -0.769819974899292, + "objective/rlhf_reward": 1.3207200556993488, + "objective/scores": 1.1, + "policy/approxkl_avg": 163.5728759765625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4998992681503296, + "step": 11, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982101917266846 + }, + { + "episode": 208, + "epoch": 0.003738720926052414, + "loss/policy_avg": 0.30875226855278015, + "lr": 2.9976993865030675e-06, + "objective/entropy": 128.93115234375, + "objective/kl": 8.55907154083252, + "objective/non_score_reward": -0.8559072017669678, + "objective/rlhf_reward": -3.0236288517713543, + "objective/scores": 0.1, + "policy/approxkl_avg": 115.09192657470703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6001245975494385, + "step": 12, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99881911277771 + }, + { + "episode": 224, + "epoch": 0.004026314843441061, + "loss/policy_avg": 0.5440771579742432, + "lr": 2.9975076687116563e-06, + "objective/entropy": 194.59161376953125, + "objective/kl": 14.834866523742676, + "objective/non_score_reward": -1.4834866523742676, + "objective/rlhf_reward": -1.5339468330144879, + "objective/scores": 1.1, + "policy/approxkl_avg": 319.5052185058594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.46385329961776733, + "step": 13, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979116916656494 + }, + { + "episode": 240, + "epoch": 0.004313908760829708, + "loss/policy_avg": 0.12125951051712036, + "lr": 2.9973159509202455e-06, + "objective/entropy": 110.74070739746094, + "objective/kl": 6.117404937744141, + "objective/non_score_reward": -0.6117404699325562, + "objective/rlhf_reward": -4.446961879730225, + "objective/scores": -0.5, + "policy/approxkl_avg": 54.780738830566406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6084516048431396, + "step": 14, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986605644226074 + }, + { + "episode": 256, + "epoch": 0.004601502678218356, + "loss/policy_avg": 0.2855263352394104, + "lr": 2.9971242331288343e-06, + "objective/entropy": -80.00950622558594, + "objective/kl": 8.695795059204102, + "objective/non_score_reward": -0.8695796728134155, + "objective/rlhf_reward": -1.3556124068060258, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 134.0033416748047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4744781255722046, + "step": 15, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99568772315979 + }, + { + "episode": 272, + "epoch": 0.004889096595607003, + "loss/policy_avg": 0.34731030464172363, + "lr": 2.996932515337423e-06, + "objective/entropy": -193.7414093017578, + "objective/kl": 6.8158183097839355, + "objective/non_score_reward": -0.6815819144248962, + "objective/rlhf_reward": -0.7789164585637408, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 94.75167083740234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5870293378829956, + "step": 16, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999891996383667 + }, + { + "episode": 288, + "epoch": 0.00517669051299565, + "loss/policy_avg": 0.09466144442558289, + "lr": 2.9967407975460124e-06, + "objective/entropy": 13.335285186767578, + "objective/kl": 7.405551910400391, + "objective/non_score_reward": -0.7405551671981812, + "objective/rlhf_reward": -4.962221145629883, + "objective/scores": -0.5, + "policy/approxkl_avg": 131.025146484375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4989420771598816, + "step": 17, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998366832733154 + }, + { + "episode": 304, + "epoch": 0.0054642844303842975, + "loss/policy_avg": 0.05795682966709137, + "lr": 2.996549079754601e-06, + "objective/entropy": 121.52836608886719, + "objective/kl": 9.26551628112793, + "objective/non_score_reward": -0.9265516996383667, + "objective/rlhf_reward": -3.3062067613005635, + "objective/scores": 0.1, + "policy/approxkl_avg": 123.83927917480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7211639285087585, + "step": 18, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0007009506225586 + }, + { + "episode": 320, + "epoch": 0.005751878347772944, + "loss/policy_avg": 0.33338093757629395, + "lr": 2.9963573619631904e-06, + "objective/entropy": -9.356884002685547, + "objective/kl": 4.64314079284668, + "objective/non_score_reward": -0.46431419253349304, + "objective/rlhf_reward": -1.4572567533701657, + "objective/scores": 0.1, + "policy/approxkl_avg": 62.679962158203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.545394778251648, + "step": 19, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998852252960205 + }, + { + "episode": 336, + "epoch": 0.006039472265161592, + "loss/policy_avg": 0.1268569827079773, + "lr": 2.9961656441717792e-06, + "objective/entropy": 150.50843811035156, + "objective/kl": 4.864284515380859, + "objective/non_score_reward": -0.48642849922180176, + "objective/rlhf_reward": -0.3894547065168168, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 35.721397399902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5442019701004028, + "step": 20, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002377986907959 + }, + { + "episode": 352, + "epoch": 0.006327066182550239, + "loss/policy_avg": 0.21250608563423157, + "lr": 2.995973926380368e-06, + "objective/entropy": 80.20552062988281, + "objective/kl": 9.005538940429688, + "objective/non_score_reward": -0.9005540013313293, + "objective/rlhf_reward": -1.8688826049367586, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 145.47787475585938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5761544704437256, + "step": 21, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991693496704102 + }, + { + "episode": 368, + "epoch": 0.006614660099938887, + "loss/policy_avg": 0.049590617418289185, + "lr": 2.9957822085889573e-06, + "objective/entropy": -100.090576171875, + "objective/kl": 12.812070846557617, + "objective/non_score_reward": -1.2812069654464722, + "objective/rlhf_reward": -3.1774167818593337, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 271.628173828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.636669397354126, + "step": 22, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9962739944458008 + }, + { + "episode": 384, + "epoch": 0.0069022540173275335, + "loss/policy_avg": 0.09350229799747467, + "lr": 2.995590490797546e-06, + "objective/entropy": -59.7061653137207, + "objective/kl": 12.184288024902344, + "objective/non_score_reward": -1.2184288501739502, + "objective/rlhf_reward": -2.751008885280166, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 219.70611572265625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6429183483123779, + "step": 23, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0023770332336426 + }, + { + "episode": 400, + "epoch": 0.00718984793471618, + "loss/policy_avg": 0.05978470295667648, + "lr": 2.995398773006135e-06, + "objective/entropy": 102.0956802368164, + "objective/kl": 4.846743106842041, + "objective/non_score_reward": -0.484674334526062, + "objective/rlhf_reward": 0.9850217357862268, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 55.893699645996094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7100609540939331, + "step": 24, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99696946144104 + }, + { + "episode": 416, + "epoch": 0.007477441852104828, + "loss/policy_avg": 0.047114282846450806, + "lr": 2.995207055214724e-06, + "objective/entropy": -127.81246185302734, + "objective/kl": 9.111923217773438, + "objective/non_score_reward": -0.9111922979354858, + "objective/rlhf_reward": -3.244769042730331, + "objective/scores": 0.1, + "policy/approxkl_avg": 122.56946563720703, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6466059684753418, + "step": 25, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0014986991882324 + }, + { + "episode": 432, + "epoch": 0.007765035769493475, + "loss/policy_avg": 0.7473582029342651, + "lr": 2.995015337423313e-06, + "objective/entropy": 150.059814453125, + "objective/kl": 10.861345291137695, + "objective/non_score_reward": -1.0861345529556274, + "objective/rlhf_reward": -6.34453821182251, + "objective/scores": -0.5, + "policy/approxkl_avg": 136.23809814453125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46276742219924927, + "step": 26, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0018882751464844 + }, + { + "episode": 448, + "epoch": 0.008052629686882123, + "loss/policy_avg": 0.444513201713562, + "lr": 2.994823619631902e-06, + "objective/entropy": -46.63388442993164, + "objective/kl": 8.348082542419434, + "objective/non_score_reward": -0.834808349609375, + "objective/rlhf_reward": -2.939233502745628, + "objective/scores": 0.1, + "policy/approxkl_avg": 137.52908325195312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7123738527297974, + "step": 27, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9967291355133057 + }, + { + "episode": 464, + "epoch": 0.00834022360427077, + "loss/policy_avg": 0.11199073493480682, + "lr": 2.994631901840491e-06, + "objective/entropy": -52.04168701171875, + "objective/kl": 7.455352783203125, + "objective/non_score_reward": -0.7455353736877441, + "objective/rlhf_reward": -0.5821412935853005, + "objective/scores": 0.6, + "policy/approxkl_avg": 92.71267700195312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48107001185417175, + "step": 28, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976396560668945 + }, + { + "episode": 480, + "epoch": 0.008627817521659416, + "loss/policy_avg": 0.11989644169807434, + "lr": 2.9944401840490798e-06, + "objective/entropy": -23.16903305053711, + "objective/kl": 6.455074310302734, + "objective/non_score_reward": -0.6455073356628418, + "objective/rlhf_reward": 1.8179705530405048, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.73835754394531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.43547505140304565, + "step": 29, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984638690948486 + }, + { + "episode": 496, + "epoch": 0.008915411439048063, + "loss/policy_avg": -0.034143999218940735, + "lr": 2.994248466257669e-06, + "objective/entropy": -80.7169418334961, + "objective/kl": 6.607659339904785, + "objective/non_score_reward": -0.6607659459114075, + "objective/rlhf_reward": -0.2430639252066611, + "objective/scores": 0.6, + "policy/approxkl_avg": 48.386138916015625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6621455550193787, + "step": 30, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0041260719299316 + }, + { + "episode": 512, + "epoch": 0.009203005356436712, + "loss/policy_avg": 0.3250073790550232, + "lr": 2.994056748466258e-06, + "objective/entropy": 30.010330200195312, + "objective/kl": 8.299867630004883, + "objective/non_score_reward": -0.8299866914749146, + "objective/rlhf_reward": -0.39622787082311783, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 136.398681640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5553452372550964, + "step": 31, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970982074737549 + }, + { + "episode": 528, + "epoch": 0.009490599273825359, + "loss/policy_avg": 0.128182053565979, + "lr": 2.9938650306748466e-06, + "objective/entropy": -206.34194946289062, + "objective/kl": 2.580972671508789, + "objective/non_score_reward": -0.25809726119041443, + "objective/rlhf_reward": 1.89132993972185, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.516483306884766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4866250157356262, + "step": 32, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0006601810455322 + }, + { + "episode": 544, + "epoch": 0.009778193191214006, + "loss/policy_avg": 0.14243654906749725, + "lr": 2.993673312883436e-06, + "objective/entropy": 26.605953216552734, + "objective/kl": 4.795662879943848, + "objective/non_score_reward": -0.4795662462711334, + "objective/rlhf_reward": 0.029146325810019302, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 41.724029541015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3814541697502136, + "step": 33, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984464645385742 + }, + { + "episode": 560, + "epoch": 0.010065787108602653, + "loss/policy_avg": 0.0956430584192276, + "lr": 2.9934815950920243e-06, + "objective/entropy": -151.531982421875, + "objective/kl": 8.481374740600586, + "objective/non_score_reward": -0.8481374979019165, + "objective/rlhf_reward": -2.9925499673932787, + "objective/scores": 0.1, + "policy/approxkl_avg": 161.83607482910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7393007278442383, + "step": 34, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999760627746582 + }, + { + "episode": 576, + "epoch": 0.0103533810259913, + "loss/policy_avg": 0.20686647295951843, + "lr": 2.9932898773006135e-06, + "objective/entropy": -46.547210693359375, + "objective/kl": 15.390876770019531, + "objective/non_score_reward": -1.5390875339508057, + "objective/rlhf_reward": -3.232631032110426, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 324.61724853515625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.49993449449539185, + "step": 35, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998364448547363 + }, + { + "episode": 592, + "epoch": 0.010640974943379948, + "loss/policy_avg": 0.14229583740234375, + "lr": 2.9930981595092023e-06, + "objective/entropy": 53.75727462768555, + "objective/kl": 9.625295639038086, + "objective/non_score_reward": -0.9625297784805298, + "objective/rlhf_reward": 0.5498811393976215, + "objective/scores": 1.1, + "policy/approxkl_avg": 131.6404266357422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7562763690948486, + "step": 36, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986224174499512 + }, + { + "episode": 608, + "epoch": 0.010928568860768595, + "loss/policy_avg": 0.09236406534910202, + "lr": 2.9929064417177915e-06, + "objective/entropy": 13.225410461425781, + "objective/kl": 6.162755966186523, + "objective/non_score_reward": -0.6162755489349365, + "objective/rlhf_reward": -2.065102344751358, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.094545364379883, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6474248170852661, + "step": 37, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988819360733032 + }, + { + "episode": 624, + "epoch": 0.011216162778157242, + "loss/policy_avg": 0.3340108394622803, + "lr": 2.9927147239263803e-06, + "objective/entropy": 62.37703323364258, + "objective/kl": 4.8724799156188965, + "objective/non_score_reward": -0.4872480034828186, + "objective/rlhf_reward": -1.5489919766783713, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.97528076171875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3959454894065857, + "step": 38, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985570907592773 + }, + { + "episode": 640, + "epoch": 0.011503756695545889, + "loss/policy_avg": 0.028094250708818436, + "lr": 2.992523006134969e-06, + "objective/entropy": -5.561492919921875, + "objective/kl": 11.000988006591797, + "objective/non_score_reward": -1.100098967552185, + "objective/rlhf_reward": -4.000396034121513, + "objective/scores": 0.1, + "policy/approxkl_avg": 196.45921325683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7062462568283081, + "step": 39, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003294944763184 + }, + { + "episode": 656, + "epoch": 0.011791350612934537, + "loss/policy_avg": 0.08169247210025787, + "lr": 2.9923312883435584e-06, + "objective/entropy": -130.83839416503906, + "objective/kl": 8.569768905639648, + "objective/non_score_reward": -0.8569770455360413, + "objective/rlhf_reward": 0.9720918476581577, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.26432037353516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7602853775024414, + "step": 40, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99922776222229 + }, + { + "episode": 672, + "epoch": 0.012078944530323184, + "loss/policy_avg": 0.0993257462978363, + "lr": 2.992139570552147e-06, + "objective/entropy": 204.11431884765625, + "objective/kl": 5.267461776733398, + "objective/non_score_reward": -0.5267462134361267, + "objective/rlhf_reward": -1.7069848686456681, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.98307800292969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5099983215332031, + "step": 41, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977500438690186 + }, + { + "episode": 688, + "epoch": 0.012366538447711831, + "loss/policy_avg": 0.14465492963790894, + "lr": 2.9919478527607364e-06, + "objective/entropy": 65.64169311523438, + "objective/kl": 8.029642105102539, + "objective/non_score_reward": -0.8029642701148987, + "objective/rlhf_reward": 1.1881429269909862, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.06425476074219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4883229732513428, + "step": 42, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979116916656494 + }, + { + "episode": 704, + "epoch": 0.012654132365100478, + "loss/policy_avg": 0.3832010328769684, + "lr": 2.9917561349693252e-06, + "objective/entropy": -72.26643371582031, + "objective/kl": 8.761590957641602, + "objective/non_score_reward": -0.876159131526947, + "objective/rlhf_reward": -5.504636764526367, + "objective/scores": -0.5, + "policy/approxkl_avg": 114.79124450683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7940360903739929, + "step": 43, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983117580413818 + }, + { + "episode": 720, + "epoch": 0.012941726282489125, + "loss/policy_avg": 0.1457168310880661, + "lr": 2.991564417177914e-06, + "objective/entropy": 215.26284790039062, + "objective/kl": 8.928382873535156, + "objective/non_score_reward": -0.8928384184837341, + "objective/rlhf_reward": 0.8286463558673862, + "objective/scores": 1.1, + "policy/approxkl_avg": 73.42234802246094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5377756953239441, + "step": 44, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967031478881836 + }, + { + "episode": 736, + "epoch": 0.013229320199877773, + "loss/policy_avg": 0.5347464680671692, + "lr": 2.9913726993865033e-06, + "objective/entropy": -0.6218109130859375, + "objective/kl": 10.952564239501953, + "objective/non_score_reward": -1.0952564477920532, + "objective/rlhf_reward": -1.98102588057518, + "objective/scores": 0.6, + "policy/approxkl_avg": 208.24761962890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8326528072357178, + "step": 45, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986271858215332 + }, + { + "episode": 752, + "epoch": 0.01351691411726642, + "loss/policy_avg": 0.08795450627803802, + "lr": 2.991180981595092e-06, + "objective/entropy": 53.97735595703125, + "objective/kl": 9.161317825317383, + "objective/non_score_reward": -0.9161317348480225, + "objective/rlhf_reward": -3.2645270287990567, + "objective/scores": 0.1, + "policy/approxkl_avg": 139.39212036132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3978268504142761, + "step": 46, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986640214920044 + }, + { + "episode": 768, + "epoch": 0.013804508034655067, + "loss/policy_avg": 0.22336724400520325, + "lr": 2.990989263803681e-06, + "objective/entropy": 95.49320983886719, + "objective/kl": 6.099149703979492, + "objective/non_score_reward": -0.6099148988723755, + "objective/rlhf_reward": 0.4840593293893609, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 63.69355773925781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6312753558158875, + "step": 47, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990160465240479 + }, + { + "episode": 784, + "epoch": 0.014092101952043714, + "loss/policy_avg": -0.4834544062614441, + "lr": 2.99079754601227e-06, + "objective/entropy": 103.409912109375, + "objective/kl": 8.192754745483398, + "objective/non_score_reward": -0.8192753195762634, + "objective/rlhf_reward": -0.8771014422178267, + "objective/scores": 0.6, + "policy/approxkl_avg": 209.68890380859375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6371721029281616, + "step": 48, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002427101135254 + }, + { + "episode": 800, + "epoch": 0.01437969586943236, + "loss/policy_avg": 0.0012040697038173676, + "lr": 2.990605828220859e-06, + "objective/entropy": 278.3375244140625, + "objective/kl": 15.085844039916992, + "objective/non_score_reward": -1.5085842609405518, + "objective/rlhf_reward": -5.6343372169882056, + "objective/scores": 0.1, + "policy/approxkl_avg": 569.4739379882812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8680420517921448, + "step": 49, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974499940872192 + }, + { + "episode": 816, + "epoch": 0.01466728978682101, + "loss/policy_avg": 0.08105640113353729, + "lr": 2.990414110429448e-06, + "objective/entropy": 82.5201416015625, + "objective/kl": 14.61972713470459, + "objective/non_score_reward": -1.461972713470459, + "objective/rlhf_reward": -7.847890853881836, + "objective/scores": -0.5, + "policy/approxkl_avg": 473.65753173828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8534562587738037, + "step": 50, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0009145736694336 + }, + { + "episode": 832, + "epoch": 0.014954883704209656, + "loss/policy_avg": 0.40666699409484863, + "lr": 2.990222392638037e-06, + "objective/entropy": -77.4280014038086, + "objective/kl": 7.943630218505859, + "objective/non_score_reward": -0.7943630814552307, + "objective/rlhf_reward": -0.2537333711397376, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 91.87364196777344, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6725068092346191, + "step": 51, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985764026641846 + }, + { + "episode": 848, + "epoch": 0.015242477621598303, + "loss/policy_avg": 0.37404656410217285, + "lr": 2.990030674846626e-06, + "objective/entropy": 48.54077911376953, + "objective/kl": 11.823626518249512, + "objective/non_score_reward": -1.1823625564575195, + "objective/rlhf_reward": -6.729450225830078, + "objective/scores": -0.5, + "policy/approxkl_avg": 223.62557983398438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7328237295150757, + "step": 52, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987246990203857 + }, + { + "episode": 864, + "epoch": 0.01553007153898695, + "loss/policy_avg": 0.5382946729660034, + "lr": 2.989838957055215e-06, + "objective/entropy": -190.2376708984375, + "objective/kl": 9.007357597351074, + "objective/non_score_reward": -0.9007357954978943, + "objective/rlhf_reward": -0.6792241677057472, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 168.06661987304688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4519304931163788, + "step": 53, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998849868774414 + }, + { + "episode": 880, + "epoch": 0.0158176654563756, + "loss/policy_avg": 0.4903010427951813, + "lr": 2.989647239263804e-06, + "objective/entropy": 7.207241058349609, + "objective/kl": 4.414880275726318, + "objective/non_score_reward": -0.44148799777030945, + "objective/rlhf_reward": 2.634048038721085, + "objective/scores": 1.1, + "policy/approxkl_avg": 44.266204833984375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.747165858745575, + "step": 54, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998467206954956 + }, + { + "episode": 896, + "epoch": 0.016105259373764245, + "loss/policy_avg": 0.009914087131619453, + "lr": 2.989455521472393e-06, + "objective/entropy": -35.93499755859375, + "objective/kl": 8.610208511352539, + "objective/non_score_reward": -0.8610208630561829, + "objective/rlhf_reward": -1.6192545398798694, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 146.18605041503906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6474767923355103, + "step": 55, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002448558807373 + }, + { + "episode": 912, + "epoch": 0.016392853291152892, + "loss/policy_avg": 0.009470928460359573, + "lr": 2.9892638036809815e-06, + "objective/entropy": -125.8683853149414, + "objective/kl": 9.624561309814453, + "objective/non_score_reward": -0.9624560475349426, + "objective/rlhf_reward": -0.9261052950632301, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 156.87704467773438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4992007613182068, + "step": 56, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993857145309448 + }, + { + "episode": 928, + "epoch": 0.01668044720854154, + "loss/policy_avg": 0.30396610498428345, + "lr": 2.9890720858895707e-06, + "objective/entropy": 47.94700622558594, + "objective/kl": 8.891968727111816, + "objective/non_score_reward": -0.8891968727111816, + "objective/rlhf_reward": -5.556787490844727, + "objective/scores": -0.5, + "policy/approxkl_avg": 125.919189453125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7451653480529785, + "step": 57, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991803169250488 + }, + { + "episode": 944, + "epoch": 0.016968041125930186, + "loss/policy_avg": -0.1904684454202652, + "lr": 2.9888803680981595e-06, + "objective/entropy": 228.5303192138672, + "objective/kl": 4.137008190155029, + "objective/non_score_reward": -0.4137008786201477, + "objective/rlhf_reward": -1.25480350703001, + "objective/scores": 0.1, + "policy/approxkl_avg": 51.00769805908203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6238487958908081, + "step": 58, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0140719413757324 + }, + { + "episode": 960, + "epoch": 0.017255635043318833, + "loss/policy_avg": 0.8186465501785278, + "lr": 2.9886886503067483e-06, + "objective/entropy": -20.18294906616211, + "objective/kl": 10.374330520629883, + "objective/non_score_reward": -1.03743314743042, + "objective/rlhf_reward": -1.7497324258089064, + "objective/scores": 0.6, + "policy/approxkl_avg": 178.15145874023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5619127750396729, + "step": 59, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0021634101867676 + }, + { + "episode": 976, + "epoch": 0.01754322896070748, + "loss/policy_avg": 0.45171883702278137, + "lr": 2.9884969325153375e-06, + "objective/entropy": 44.993682861328125, + "objective/kl": 7.884735584259033, + "objective/non_score_reward": -0.78847336769104, + "objective/rlhf_reward": -2.75389347076416, + "objective/scores": 0.1, + "policy/approxkl_avg": 131.51107788085938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5738007426261902, + "step": 60, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988288879394531 + }, + { + "episode": 992, + "epoch": 0.017830822878096127, + "loss/policy_avg": 0.697486162185669, + "lr": 2.9883052147239263e-06, + "objective/entropy": 26.72112274169922, + "objective/kl": 7.41924524307251, + "objective/non_score_reward": -0.7419244647026062, + "objective/rlhf_reward": -0.5676979482173918, + "objective/scores": 0.6, + "policy/approxkl_avg": 87.49612426757812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.673904538154602, + "step": 61, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979908466339111 + }, + { + "episode": 1008, + "epoch": 0.018118416795484777, + "loss/policy_avg": 0.08780250698328018, + "lr": 2.988113496932515e-06, + "objective/entropy": 144.9136962890625, + "objective/kl": 7.059360504150391, + "objective/non_score_reward": -0.705936074256897, + "objective/rlhf_reward": 0.09997491097333766, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 81.5174560546875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7397779226303101, + "step": 62, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999450922012329 + }, + { + "episode": 1024, + "epoch": 0.018406010712873424, + "loss/policy_avg": 0.6007635593414307, + "lr": 2.9879217791411044e-06, + "objective/entropy": 116.3339614868164, + "objective/kl": 7.176075458526611, + "objective/non_score_reward": -0.7176075577735901, + "objective/rlhf_reward": -2.470430406183004, + "objective/scores": 0.1, + "policy/approxkl_avg": 73.61609649658203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4787016808986664, + "step": 63, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989898204803467 + }, + { + "episode": 1040, + "epoch": 0.01869360463026207, + "loss/policy_avg": 0.14422942698001862, + "lr": 2.987730061349693e-06, + "objective/entropy": 124.52241516113281, + "objective/kl": 9.423843383789062, + "objective/non_score_reward": -0.9423844218254089, + "objective/rlhf_reward": -0.8458185240041939, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 138.37496948242188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8451459407806396, + "step": 64, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998455286026001 + }, + { + "episode": 1056, + "epoch": 0.018981198547650718, + "loss/policy_avg": 0.38644856214523315, + "lr": 2.9875383435582824e-06, + "objective/entropy": -148.96302795410156, + "objective/kl": 6.528387546539307, + "objective/non_score_reward": -0.6528387665748596, + "objective/rlhf_reward": -0.4886488600828983, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 81.52159881591797, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.643337607383728, + "step": 65, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999079942703247 + }, + { + "episode": 1072, + "epoch": 0.019268792465039365, + "loss/policy_avg": -0.05883178487420082, + "lr": 2.9873466257668712e-06, + "objective/entropy": -121.89275360107422, + "objective/kl": 7.966899871826172, + "objective/non_score_reward": -0.7966899871826172, + "objective/rlhf_reward": -2.786759978532791, + "objective/scores": 0.1, + "policy/approxkl_avg": 141.0238037109375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6699905395507812, + "step": 66, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981606006622314 + }, + { + "episode": 1088, + "epoch": 0.01955638638242801, + "loss/policy_avg": 0.21825438737869263, + "lr": 2.98715490797546e-06, + "objective/entropy": 17.15899658203125, + "objective/kl": 11.302406311035156, + "objective/non_score_reward": -1.130240559577942, + "objective/rlhf_reward": -6.520962715148926, + "objective/scores": -0.5, + "policy/approxkl_avg": 198.47238159179688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7949972748756409, + "step": 67, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980106353759766 + }, + { + "episode": 1104, + "epoch": 0.019843980299816658, + "loss/policy_avg": 0.2142024040222168, + "lr": 2.9869631901840493e-06, + "objective/entropy": -47.186798095703125, + "objective/kl": 10.205244064331055, + "objective/non_score_reward": -1.0205243825912476, + "objective/rlhf_reward": -3.682097455859184, + "objective/scores": 0.1, + "policy/approxkl_avg": 154.7986297607422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6624069213867188, + "step": 68, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982914924621582 + }, + { + "episode": 1120, + "epoch": 0.020131574217205305, + "loss/policy_avg": 0.38636407256126404, + "lr": 2.986771472392638e-06, + "objective/entropy": -56.353668212890625, + "objective/kl": 9.813121795654297, + "objective/non_score_reward": -0.9813121557235718, + "objective/rlhf_reward": -5.925248146057129, + "objective/scores": -0.5, + "policy/approxkl_avg": 89.94273376464844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.49075233936309814, + "step": 69, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0006632804870605 + }, + { + "episode": 1136, + "epoch": 0.020419168134593952, + "loss/policy_avg": 0.1683022379875183, + "lr": 2.9865797546012273e-06, + "objective/entropy": 218.84620666503906, + "objective/kl": 18.28194808959961, + "objective/non_score_reward": -1.8281950950622559, + "objective/rlhf_reward": -4.912780082225799, + "objective/scores": 0.6, + "policy/approxkl_avg": 422.34417724609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5468066930770874, + "step": 70, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993388652801514 + }, + { + "episode": 1152, + "epoch": 0.0207067620519826, + "loss/policy_avg": 0.12364174425601959, + "lr": 2.986388036809816e-06, + "objective/entropy": -157.8475341796875, + "objective/kl": 13.170989036560059, + "objective/non_score_reward": -1.3170989751815796, + "objective/rlhf_reward": -7.26839542388916, + "objective/scores": -0.5, + "policy/approxkl_avg": 184.91448974609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8540889620780945, + "step": 71, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9987365007400513 + }, + { + "episode": 1168, + "epoch": 0.02099435596937125, + "loss/policy_avg": 0.03804938867688179, + "lr": 2.986196319018405e-06, + "objective/entropy": 12.461807250976562, + "objective/kl": 7.584700107574463, + "objective/non_score_reward": -0.7584700584411621, + "objective/rlhf_reward": -0.11016108536836766, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 53.31656265258789, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7939921617507935, + "step": 72, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002026557922363 + }, + { + "episode": 1184, + "epoch": 0.021281949886759896, + "loss/policy_avg": 0.4785844683647156, + "lr": 2.986004601226994e-06, + "objective/entropy": -64.63442993164062, + "objective/kl": 13.00765609741211, + "objective/non_score_reward": -1.3007656335830688, + "objective/rlhf_reward": -4.803062631189823, + "objective/scores": 0.1, + "policy/approxkl_avg": 264.447998046875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6026296615600586, + "step": 73, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995261430740356 + }, + { + "episode": 1200, + "epoch": 0.021569543804148543, + "loss/policy_avg": 0.22290995717048645, + "lr": 2.985812883435583e-06, + "objective/entropy": -106.69702911376953, + "objective/kl": 13.168065071105957, + "objective/non_score_reward": -1.316806435585022, + "objective/rlhf_reward": -7.267225742340088, + "objective/scores": -0.5, + "policy/approxkl_avg": 273.555419921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6650924682617188, + "step": 74, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982670545578003 + }, + { + "episode": 1216, + "epoch": 0.02185713772153719, + "loss/policy_avg": 0.29405301809310913, + "lr": 2.985621165644172e-06, + "objective/entropy": 133.62835693359375, + "objective/kl": 8.014554023742676, + "objective/non_score_reward": -0.8014553189277649, + "objective/rlhf_reward": -2.8058213055133816, + "objective/scores": 0.1, + "policy/approxkl_avg": 96.36099243164062, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7634880542755127, + "step": 75, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991488456726074 + }, + { + "episode": 1232, + "epoch": 0.022144731638925837, + "loss/policy_avg": 0.45445433259010315, + "lr": 2.985429447852761e-06, + "objective/entropy": 82.93301391601562, + "objective/kl": 8.670784950256348, + "objective/non_score_reward": -0.8670786023139954, + "objective/rlhf_reward": -1.0683142602443696, + "objective/scores": 0.6, + "policy/approxkl_avg": 148.96737670898438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.540373682975769, + "step": 76, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980230331420898 + }, + { + "episode": 1248, + "epoch": 0.022432325556314484, + "loss/policy_avg": 0.020047597587108612, + "lr": 2.98523773006135e-06, + "objective/entropy": -212.8873291015625, + "objective/kl": 5.708805561065674, + "objective/non_score_reward": -0.5708805918693542, + "objective/rlhf_reward": 0.6401966915118966, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 42.32015609741211, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7310128211975098, + "step": 77, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995827674865723 + }, + { + "episode": 1264, + "epoch": 0.02271991947370313, + "loss/policy_avg": -0.00048611266538500786, + "lr": 2.9850460122699387e-06, + "objective/entropy": -48.7366943359375, + "objective/kl": 6.82136344909668, + "objective/non_score_reward": -0.6821364164352417, + "objective/rlhf_reward": -2.3285456061363217, + "objective/scores": 0.1, + "policy/approxkl_avg": 66.36034393310547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4771096706390381, + "step": 78, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971214532852173 + }, + { + "episode": 1280, + "epoch": 0.023007513391091777, + "loss/policy_avg": 0.4775531589984894, + "lr": 2.9848542944785275e-06, + "objective/entropy": -153.65670776367188, + "objective/kl": 11.706863403320312, + "objective/non_score_reward": -1.1706863641738892, + "objective/rlhf_reward": -3.078625496391373, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 175.5146484375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5806171894073486, + "step": 79, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999029278755188 + }, + { + "episode": 1296, + "epoch": 0.023295107308480424, + "loss/policy_avg": 0.46404796838760376, + "lr": 2.9846625766871167e-06, + "objective/entropy": -0.7676467895507812, + "objective/kl": 9.653882026672363, + "objective/non_score_reward": -0.9653880596160889, + "objective/rlhf_reward": -3.461552521586418, + "objective/scores": 0.1, + "policy/approxkl_avg": 124.8095703125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6914379596710205, + "step": 80, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988126754760742 + }, + { + "episode": 1312, + "epoch": 0.023582701225869074, + "loss/policy_avg": 0.1949668675661087, + "lr": 2.9844708588957055e-06, + "objective/entropy": -137.8944549560547, + "objective/kl": 9.551393508911133, + "objective/non_score_reward": -0.9551393985748291, + "objective/rlhf_reward": 0.5794423833489422, + "objective/scores": 1.1, + "policy/approxkl_avg": 131.54342651367188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7368413209915161, + "step": 81, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991984367370605 + }, + { + "episode": 1328, + "epoch": 0.02387029514325772, + "loss/policy_avg": 0.08234795928001404, + "lr": 2.9842791411042943e-06, + "objective/entropy": -301.7047119140625, + "objective/kl": 11.51591682434082, + "objective/non_score_reward": -1.1515917778015137, + "objective/rlhf_reward": -4.20636705160141, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.93853759765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.785065770149231, + "step": 82, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992117881774902 + }, + { + "episode": 1344, + "epoch": 0.024157889060646368, + "loss/policy_avg": 0.11510531604290009, + "lr": 2.9840874233128835e-06, + "objective/entropy": -97.20633697509766, + "objective/kl": 10.416614532470703, + "objective/non_score_reward": -1.0416613817214966, + "objective/rlhf_reward": -2.3418168380585422, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 169.93270874023438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6234397888183594, + "step": 83, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0019311904907227 + }, + { + "episode": 1360, + "epoch": 0.024445482978035015, + "loss/policy_avg": -0.0069921668618917465, + "lr": 2.9838957055214724e-06, + "objective/entropy": 30.303848266601562, + "objective/kl": 8.926748275756836, + "objective/non_score_reward": -0.8926749229431152, + "objective/rlhf_reward": -5.570699691772461, + "objective/scores": -0.5, + "policy/approxkl_avg": 76.47716522216797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5039442181587219, + "step": 84, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998093843460083 + }, + { + "episode": 1376, + "epoch": 0.024733076895423662, + "loss/policy_avg": 0.27738839387893677, + "lr": 2.983703987730061e-06, + "objective/entropy": -166.7930450439453, + "objective/kl": 6.530454635620117, + "objective/non_score_reward": -0.6530454754829407, + "objective/rlhf_reward": -2.2121819466352464, + "objective/scores": 0.1, + "policy/approxkl_avg": 59.98471450805664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.90630704164505, + "step": 85, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998368263244629 + }, + { + "episode": 1392, + "epoch": 0.02502067081281231, + "loss/policy_avg": 0.22002673149108887, + "lr": 2.9835122699386504e-06, + "objective/entropy": -67.69169616699219, + "objective/kl": 11.10516357421875, + "objective/non_score_reward": -1.1105163097381592, + "objective/rlhf_reward": -2.6172365203228702, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 100.68114471435547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6418424844741821, + "step": 86, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970009326934814 + }, + { + "episode": 1408, + "epoch": 0.025308264730200956, + "loss/policy_avg": 0.41238439083099365, + "lr": 2.983320552147239e-06, + "objective/entropy": -16.58879852294922, + "objective/kl": 8.708709716796875, + "objective/non_score_reward": -0.8708709478378296, + "objective/rlhf_reward": -5.483483791351318, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.49099731445312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6647894978523254, + "step": 87, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9938620328903198 + }, + { + "episode": 1424, + "epoch": 0.025595858647589603, + "loss/policy_avg": -0.10927846282720566, + "lr": 2.9831288343558284e-06, + "objective/entropy": 16.377220153808594, + "objective/kl": 13.530142784118652, + "objective/non_score_reward": -1.353014349937439, + "objective/rlhf_reward": -3.2893511898079257, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 238.56805419921875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6255956888198853, + "step": 88, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985899925231934 + }, + { + "episode": 1440, + "epoch": 0.02588345256497825, + "loss/policy_avg": 0.1418202817440033, + "lr": 2.9829371165644172e-06, + "objective/entropy": 156.89816284179688, + "objective/kl": 12.802512168884277, + "objective/non_score_reward": -1.2802512645721436, + "objective/rlhf_reward": -3.296176548275064, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 294.27996826171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6715450286865234, + "step": 89, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973416328430176 + }, + { + "episode": 1456, + "epoch": 0.026171046482366896, + "loss/policy_avg": 0.008285747841000557, + "lr": 2.982745398773006e-06, + "objective/entropy": 97.516357421875, + "objective/kl": 9.460161209106445, + "objective/non_score_reward": -0.9460161328315735, + "objective/rlhf_reward": 0.6159354835748676, + "objective/scores": 1.1, + "policy/approxkl_avg": 122.57460021972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8431274890899658, + "step": 90, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984780550003052 + }, + { + "episode": 1472, + "epoch": 0.026458640399755547, + "loss/policy_avg": 0.5656089782714844, + "lr": 2.9825536809815953e-06, + "objective/entropy": 103.79216766357422, + "objective/kl": 8.058956146240234, + "objective/non_score_reward": -0.8058955669403076, + "objective/rlhf_reward": -2.8235821485519406, + "objective/scores": 0.1, + "policy/approxkl_avg": 123.79965209960938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46594178676605225, + "step": 91, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9949241876602173 + }, + { + "episode": 1488, + "epoch": 0.026746234317144194, + "loss/policy_avg": 0.6543309688568115, + "lr": 2.982361963190184e-06, + "objective/entropy": -186.4047393798828, + "objective/kl": 11.067312240600586, + "objective/non_score_reward": -1.1067311763763428, + "objective/rlhf_reward": -2.479513700084622, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 161.9315948486328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6305772066116333, + "step": 92, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993164539337158 + }, + { + "episode": 1504, + "epoch": 0.02703382823453284, + "loss/policy_avg": 0.19107398390769958, + "lr": 2.9821702453987733e-06, + "objective/entropy": -34.54255294799805, + "objective/kl": 7.058377265930176, + "objective/non_score_reward": -0.7058378458023071, + "objective/rlhf_reward": 1.5766486465930942, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.1898193359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5186392068862915, + "step": 93, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9963679313659668 + }, + { + "episode": 1520, + "epoch": 0.027321422151921487, + "loss/policy_avg": 0.0312882624566555, + "lr": 2.981978527607362e-06, + "objective/entropy": 19.131616592407227, + "objective/kl": 7.02040958404541, + "objective/non_score_reward": -0.7020410299301147, + "objective/rlhf_reward": -4.808164119720459, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.332481384277344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6216336488723755, + "step": 94, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971067905426025 + }, + { + "episode": 1536, + "epoch": 0.027609016069310134, + "loss/policy_avg": 0.08140967786312103, + "lr": 2.981786809815951e-06, + "objective/entropy": 13.800872802734375, + "objective/kl": 6.789825439453125, + "objective/non_score_reward": -0.6789825558662415, + "objective/rlhf_reward": -4.715930461883545, + "objective/scores": -0.5, + "policy/approxkl_avg": 50.50202560424805, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5023022890090942, + "step": 95, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999751329421997 + }, + { + "episode": 1552, + "epoch": 0.02789660998669878, + "loss/policy_avg": 0.1259385198354721, + "lr": 2.98159509202454e-06, + "objective/entropy": 1.0717048645019531, + "objective/kl": 10.119159698486328, + "objective/non_score_reward": -1.011915922164917, + "objective/rlhf_reward": -3.6476640462875363, + "objective/scores": 0.1, + "policy/approxkl_avg": 155.03924560546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5893478393554688, + "step": 96, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999608039855957 + }, + { + "episode": 1568, + "epoch": 0.028184203904087428, + "loss/policy_avg": 0.35828953981399536, + "lr": 2.981403374233129e-06, + "objective/entropy": 90.25312042236328, + "objective/kl": 18.374267578125, + "objective/non_score_reward": -1.8374266624450684, + "objective/rlhf_reward": -2.949706649780273, + "objective/scores": 1.1, + "policy/approxkl_avg": 272.5823669433594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4844147562980652, + "step": 97, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982144832611084 + }, + { + "episode": 1584, + "epoch": 0.028471797821476075, + "loss/policy_avg": 0.37511101365089417, + "lr": 2.981211656441718e-06, + "objective/entropy": 57.79621505737305, + "objective/kl": 11.18044662475586, + "objective/non_score_reward": -1.1180447340011597, + "objective/rlhf_reward": -6.4721784591674805, + "objective/scores": -0.5, + "policy/approxkl_avg": 282.01220703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4804549217224121, + "step": 98, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971604347229004 + }, + { + "episode": 1600, + "epoch": 0.02875939173886472, + "loss/policy_avg": 0.43748798966407776, + "lr": 2.981019938650307e-06, + "objective/entropy": -91.33480834960938, + "objective/kl": 9.78958797454834, + "objective/non_score_reward": -0.9789588451385498, + "objective/rlhf_reward": -5.915835380554199, + "objective/scores": -0.5, + "policy/approxkl_avg": 210.7143096923828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6887623071670532, + "step": 99, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999546766281128 + }, + { + "episode": 1616, + "epoch": 0.029046985656253372, + "loss/policy_avg": 0.012114331126213074, + "lr": 2.980828220858896e-06, + "objective/entropy": 21.23130226135254, + "objective/kl": 3.5349526405334473, + "objective/non_score_reward": -0.3534952402114868, + "objective/rlhf_reward": 1.509737993835238, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 18.795394897460938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.649235188961029, + "step": 100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0008366107940674 + }, + { + "episode": 1632, + "epoch": 0.02933457957364202, + "loss/policy_avg": 0.08204736560583115, + "lr": 2.9806365030674847e-06, + "objective/entropy": -141.25718688964844, + "objective/kl": 11.29146957397461, + "objective/non_score_reward": -1.1291468143463135, + "objective/rlhf_reward": -0.11658706367015803, + "objective/scores": 1.1, + "policy/approxkl_avg": 166.33197021484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6492342352867126, + "step": 101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9966166019439697 + }, + { + "episode": 1648, + "epoch": 0.029622173491030666, + "loss/policy_avg": 0.08084648847579956, + "lr": 2.9804447852760735e-06, + "objective/entropy": 111.48165893554688, + "objective/kl": 7.421027183532715, + "objective/non_score_reward": -0.7421026825904846, + "objective/rlhf_reward": 1.4315892295911912, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.68070983886719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7161847949028015, + "step": 102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976788759231567 + }, + { + "episode": 1664, + "epoch": 0.029909767408419313, + "loss/policy_avg": 0.34747451543807983, + "lr": 2.9802530674846627e-06, + "objective/entropy": 11.092723846435547, + "objective/kl": 10.786249160766602, + "objective/non_score_reward": -1.078624963760376, + "objective/rlhf_reward": -6.314499855041504, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.8665008544922, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6497814655303955, + "step": 103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999152660369873 + }, + { + "episode": 1680, + "epoch": 0.03019736132580796, + "loss/policy_avg": 0.19717364013195038, + "lr": 2.9800613496932515e-06, + "objective/entropy": 84.13946533203125, + "objective/kl": 14.11801528930664, + "objective/non_score_reward": -1.4118015766143799, + "objective/rlhf_reward": -7.6472063064575195, + "objective/scores": -0.5, + "policy/approxkl_avg": 345.1640319824219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7492287158966064, + "step": 104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9964749813079834 + }, + { + "episode": 1696, + "epoch": 0.030484955243196606, + "loss/policy_avg": 0.31150949001312256, + "lr": 2.9798696319018403e-06, + "objective/entropy": 189.52505493164062, + "objective/kl": 6.602322578430176, + "objective/non_score_reward": -0.6602323055267334, + "objective/rlhf_reward": -2.2409292221069332, + "objective/scores": 0.1, + "policy/approxkl_avg": 36.76777648925781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7701338529586792, + "step": 105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979405403137207 + }, + { + "episode": 1712, + "epoch": 0.030772549160585253, + "loss/policy_avg": 0.863737940788269, + "lr": 2.9796779141104296e-06, + "objective/entropy": -57.84851837158203, + "objective/kl": 10.454719543457031, + "objective/non_score_reward": -1.0454717874526978, + "objective/rlhf_reward": -3.7818873882293698, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.14009094238281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6733952164649963, + "step": 106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979674816131592 + }, + { + "episode": 1728, + "epoch": 0.0310601430779739, + "loss/policy_avg": 0.3714882731437683, + "lr": 2.9794861963190184e-06, + "objective/entropy": -48.674835205078125, + "objective/kl": 8.692556381225586, + "objective/non_score_reward": -0.869255542755127, + "objective/rlhf_reward": -1.354315804616485, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 95.59877014160156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5013031959533691, + "step": 107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974138736724854 + }, + { + "episode": 1744, + "epoch": 0.03134773699536255, + "loss/policy_avg": 0.279630184173584, + "lr": 2.9792944785276076e-06, + "objective/entropy": 45.82620620727539, + "objective/kl": 6.540558338165283, + "objective/non_score_reward": -0.6540557742118835, + "objective/rlhf_reward": -4.616223335266113, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.67784118652344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46648138761520386, + "step": 108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995229244232178 + }, + { + "episode": 1760, + "epoch": 0.0316353309127512, + "loss/policy_avg": 0.08991475403308868, + "lr": 2.9791027607361964e-06, + "objective/entropy": -64.23330688476562, + "objective/kl": 8.345191955566406, + "objective/non_score_reward": -0.8345192074775696, + "objective/rlhf_reward": -2.938076882064342, + "objective/scores": 0.1, + "policy/approxkl_avg": 93.55230712890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48790040612220764, + "step": 109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989550113677979 + }, + { + "episode": 1776, + "epoch": 0.031922924830139844, + "loss/policy_avg": 0.28548339009284973, + "lr": 2.9789110429447852e-06, + "objective/entropy": 43.4534912109375, + "objective/kl": 13.334844589233398, + "objective/non_score_reward": -1.333484411239624, + "objective/rlhf_reward": -4.933937734365463, + "objective/scores": 0.1, + "policy/approxkl_avg": 361.4084167480469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.721168041229248, + "step": 110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000476121902466 + }, + { + "episode": 1792, + "epoch": 0.03221051874752849, + "loss/policy_avg": 0.4107435941696167, + "lr": 2.9787193251533744e-06, + "objective/entropy": -182.18148803710938, + "objective/kl": 7.509696006774902, + "objective/non_score_reward": -0.750969648361206, + "objective/rlhf_reward": 1.396121428906918, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.65406036376953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5355631113052368, + "step": 111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999549388885498 + }, + { + "episode": 1808, + "epoch": 0.03249811266491714, + "loss/policy_avg": 0.3796160817146301, + "lr": 2.9785276073619633e-06, + "objective/entropy": 124.50952911376953, + "objective/kl": 8.484485626220703, + "objective/non_score_reward": -0.8484484553337097, + "objective/rlhf_reward": -5.393794059753418, + "objective/scores": -0.5, + "policy/approxkl_avg": 126.5394287109375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6620683670043945, + "step": 112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979877471923828 + }, + { + "episode": 1824, + "epoch": 0.032785706582305785, + "loss/policy_avg": 0.26249387860298157, + "lr": 2.978335889570552e-06, + "objective/entropy": 25.24797821044922, + "objective/kl": 13.403532028198242, + "objective/non_score_reward": -1.34035325050354, + "objective/rlhf_reward": -0.961412942409515, + "objective/scores": 1.1, + "policy/approxkl_avg": 187.52792358398438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6351895332336426, + "step": 113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988791942596436 + }, + { + "episode": 1840, + "epoch": 0.03307330049969443, + "loss/policy_avg": 0.3651992380619049, + "lr": 2.9781441717791413e-06, + "objective/entropy": 95.998779296875, + "objective/kl": 13.46788215637207, + "objective/non_score_reward": -1.3467882871627808, + "objective/rlhf_reward": -7.387153148651123, + "objective/scores": -0.5, + "policy/approxkl_avg": 247.23101806640625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6072109937667847, + "step": 114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9954397678375244 + }, + { + "episode": 1856, + "epoch": 0.03336089441708308, + "loss/policy_avg": 0.3738645017147064, + "lr": 2.97795245398773e-06, + "objective/entropy": 247.19410705566406, + "objective/kl": 14.608449935913086, + "objective/non_score_reward": -1.4608449935913086, + "objective/rlhf_reward": -7.843379974365234, + "objective/scores": -0.5, + "policy/approxkl_avg": 280.8343505859375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7322804927825928, + "step": 115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977314472198486 + }, + { + "episode": 1872, + "epoch": 0.033648488334471725, + "loss/policy_avg": 0.39061659574508667, + "lr": 2.9777607361963193e-06, + "objective/entropy": 58.74927520751953, + "objective/kl": 11.686922073364258, + "objective/non_score_reward": -1.1686923503875732, + "objective/rlhf_reward": -6.674769401550293, + "objective/scores": -0.5, + "policy/approxkl_avg": 226.12789916992188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48344868421554565, + "step": 116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999885559082031 + }, + { + "episode": 1888, + "epoch": 0.03393608225186037, + "loss/policy_avg": 0.3333742022514343, + "lr": 2.977569018404908e-06, + "objective/entropy": -19.94247055053711, + "objective/kl": 9.790740966796875, + "objective/non_score_reward": -0.979074239730835, + "objective/rlhf_reward": -0.9925778030764787, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 72.29800415039062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5458022952079773, + "step": 117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997154951095581 + }, + { + "episode": 1904, + "epoch": 0.03422367616924902, + "loss/policy_avg": 0.09615316987037659, + "lr": 2.977377300613497e-06, + "objective/entropy": -167.89923095703125, + "objective/kl": 8.815143585205078, + "objective/non_score_reward": -0.8815143704414368, + "objective/rlhf_reward": -3.1260574519634243, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.062522888183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8456206321716309, + "step": 118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0000126361846924 + }, + { + "episode": 1920, + "epoch": 0.034511270086637666, + "loss/policy_avg": 0.08710992336273193, + "lr": 2.977185582822086e-06, + "objective/entropy": 263.03179931640625, + "objective/kl": 11.179251670837402, + "objective/non_score_reward": -1.1179251670837402, + "objective/rlhf_reward": -4.071700690686702, + "objective/scores": 0.1, + "policy/approxkl_avg": 161.03988647460938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7780265808105469, + "step": 119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994699954986572 + }, + { + "episode": 1936, + "epoch": 0.03479886400402631, + "loss/policy_avg": 0.13407912850379944, + "lr": 2.976993865030675e-06, + "objective/entropy": -70.0582504272461, + "objective/kl": 6.793869972229004, + "objective/non_score_reward": -0.6793869733810425, + "objective/rlhf_reward": 0.2061711356628213, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 59.43596649169922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7297772169113159, + "step": 120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998098611831665 + }, + { + "episode": 1952, + "epoch": 0.03508645792141496, + "loss/policy_avg": 0.06745412945747375, + "lr": 2.9768021472392642e-06, + "objective/entropy": 70.17347717285156, + "objective/kl": 9.706807136535645, + "objective/non_score_reward": -0.9706807136535645, + "objective/rlhf_reward": -1.9353115958737686, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 101.62091827392578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5260573625564575, + "step": 121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000791549682617 + }, + { + "episode": 1968, + "epoch": 0.03537405183880361, + "loss/policy_avg": 0.056749723851680756, + "lr": 2.976610429447853e-06, + "objective/entropy": -228.255615234375, + "objective/kl": 5.5145463943481445, + "objective/non_score_reward": -0.5514546632766724, + "objective/rlhf_reward": -1.8058185189962388, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.354915618896484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6759341955184937, + "step": 122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004634857177734 + }, + { + "episode": 1984, + "epoch": 0.03566164575619225, + "loss/policy_avg": 0.08238844573497772, + "lr": 2.9764187116564414e-06, + "objective/entropy": 241.84060668945312, + "objective/kl": 9.057453155517578, + "objective/non_score_reward": -0.905745267868042, + "objective/rlhf_reward": -5.622981071472168, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.88520050048828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6288118362426758, + "step": 123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0009031295776367 + }, + { + "episode": 2000, + "epoch": 0.03594923967358091, + "loss/policy_avg": 0.5170639753341675, + "lr": 2.9762269938650307e-06, + "objective/entropy": -229.75860595703125, + "objective/kl": 6.897617816925049, + "objective/non_score_reward": -0.6897618174552917, + "objective/rlhf_reward": 0.1646718115198883, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 75.08253479003906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8155406713485718, + "step": 124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993102550506592 + }, + { + "episode": 2016, + "epoch": 0.036236833590969554, + "loss/policy_avg": 0.23536163568496704, + "lr": 2.9760352760736195e-06, + "objective/entropy": 104.1327896118164, + "objective/kl": 8.855351448059082, + "objective/non_score_reward": -0.8855351209640503, + "objective/rlhf_reward": -0.6184214695703713, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 104.30943298339844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7742270231246948, + "step": 125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001922607421875 + }, + { + "episode": 2032, + "epoch": 0.0365244275083582, + "loss/policy_avg": 0.3929940164089203, + "lr": 2.9758435582822087e-06, + "objective/entropy": 19.527324676513672, + "objective/kl": 7.849102973937988, + "objective/non_score_reward": -0.7849102020263672, + "objective/rlhf_reward": -0.7396408528089523, + "objective/scores": 0.6, + "policy/approxkl_avg": 43.17992401123047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49572157859802246, + "step": 126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000518321990967 + }, + { + "episode": 2048, + "epoch": 0.03681202142574685, + "loss/policy_avg": 0.02761128917336464, + "lr": 2.9756518404907975e-06, + "objective/entropy": -31.197162628173828, + "objective/kl": 11.224246978759766, + "objective/non_score_reward": -1.122424602508545, + "objective/rlhf_reward": -6.48969841003418, + "objective/scores": -0.5, + "policy/approxkl_avg": 167.82305908203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6054507493972778, + "step": 127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998534917831421 + }, + { + "episode": 2064, + "epoch": 0.037099615343135495, + "loss/policy_avg": 0.8324223756790161, + "lr": 2.9754601226993863e-06, + "objective/entropy": -79.34418487548828, + "objective/kl": 9.576016426086426, + "objective/non_score_reward": -0.95760178565979, + "objective/rlhf_reward": -5.83040714263916, + "objective/scores": -0.5, + "policy/approxkl_avg": 189.614990234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6984212398529053, + "step": 128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9968839883804321 + }, + { + "episode": 2080, + "epoch": 0.03738720926052414, + "loss/policy_avg": 0.13773420453071594, + "lr": 2.9752684049079756e-06, + "objective/entropy": 29.27914047241211, + "objective/kl": 6.572282791137695, + "objective/non_score_reward": -0.6572283506393433, + "objective/rlhf_reward": -1.0247934012749966, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 35.22698974609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4145284593105316, + "step": 129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000413417816162 + }, + { + "episode": 2096, + "epoch": 0.03767480317791279, + "loss/policy_avg": 0.14625512063503265, + "lr": 2.9750766871165644e-06, + "objective/entropy": 248.92510986328125, + "objective/kl": 8.044551849365234, + "objective/non_score_reward": -0.8044552803039551, + "objective/rlhf_reward": -5.21782112121582, + "objective/scores": -0.5, + "policy/approxkl_avg": 65.03269958496094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7755135893821716, + "step": 130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9952176809310913 + }, + { + "episode": 2112, + "epoch": 0.037962397095301435, + "loss/policy_avg": 0.08024582266807556, + "lr": 2.9748849693251536e-06, + "objective/entropy": 122.44419860839844, + "objective/kl": 17.353057861328125, + "objective/non_score_reward": -1.735305666923523, + "objective/rlhf_reward": -6.541222697496414, + "objective/scores": 0.1, + "policy/approxkl_avg": 378.7543029785156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7671464681625366, + "step": 131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9952489137649536 + }, + { + "episode": 2128, + "epoch": 0.03824999101269008, + "loss/policy_avg": 0.052941206842660904, + "lr": 2.9746932515337424e-06, + "objective/entropy": 12.529216766357422, + "objective/kl": 17.990718841552734, + "objective/non_score_reward": -1.799072027206421, + "objective/rlhf_reward": -2.7962879896163937, + "objective/scores": 1.1, + "policy/approxkl_avg": 328.53741455078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.543836236000061, + "step": 132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990971088409424 + }, + { + "episode": 2144, + "epoch": 0.03853758493007873, + "loss/policy_avg": 0.08809210360050201, + "lr": 2.9745015337423312e-06, + "objective/entropy": 92.10977172851562, + "objective/kl": 3.5646705627441406, + "objective/non_score_reward": -0.35646697878837585, + "objective/rlhf_reward": 2.974132025241852, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.450565338134766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5211790800094604, + "step": 133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005075931549072 + }, + { + "episode": 2160, + "epoch": 0.038825178847467376, + "loss/policy_avg": 0.25067800283432007, + "lr": 2.9743098159509205e-06, + "objective/entropy": -342.741455078125, + "objective/kl": 13.730775833129883, + "objective/non_score_reward": -1.373077630996704, + "objective/rlhf_reward": -5.092310494184494, + "objective/scores": 0.1, + "policy/approxkl_avg": 226.23080444335938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7523109912872314, + "step": 134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9971081018447876 + }, + { + "episode": 2176, + "epoch": 0.03911277276485602, + "loss/policy_avg": -0.05823849141597748, + "lr": 2.9741180981595093e-06, + "objective/entropy": 177.39581298828125, + "objective/kl": 2.4112541675567627, + "objective/non_score_reward": -0.2411254346370697, + "objective/rlhf_reward": -2.9645018577575684, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.3866167068481445, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4305086135864258, + "step": 135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0287272930145264 + }, + { + "episode": 2192, + "epoch": 0.03940036668224467, + "loss/policy_avg": 0.39458757638931274, + "lr": 2.973926380368098e-06, + "objective/entropy": -71.98441314697266, + "objective/kl": 11.294361114501953, + "objective/non_score_reward": -1.1294360160827637, + "objective/rlhf_reward": -1.5940250202429025, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 172.00338745117188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7612372040748596, + "step": 136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9963352680206299 + }, + { + "episode": 2208, + "epoch": 0.039687960599633317, + "loss/policy_avg": 0.6298972368240356, + "lr": 2.9737346625766873e-06, + "objective/entropy": -57.89506530761719, + "objective/kl": 8.208264350891113, + "objective/non_score_reward": -0.8208264112472534, + "objective/rlhf_reward": -2.8833055704832073, + "objective/scores": 0.1, + "policy/approxkl_avg": 89.37754821777344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.817412257194519, + "step": 137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989038705825806 + }, + { + "episode": 2224, + "epoch": 0.03997555451702196, + "loss/policy_avg": 0.07556813955307007, + "lr": 2.973542944785276e-06, + "objective/entropy": -190.96238708496094, + "objective/kl": 15.059877395629883, + "objective/non_score_reward": -1.5059877634048462, + "objective/rlhf_reward": -1.6239511057734486, + "objective/scores": 1.1, + "policy/approxkl_avg": 377.60052490234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6793420314788818, + "step": 138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978358745574951 + }, + { + "episode": 2240, + "epoch": 0.04026314843441061, + "loss/policy_avg": 0.1697106957435608, + "lr": 2.9733512269938653e-06, + "objective/entropy": 53.708675384521484, + "objective/kl": 8.52203369140625, + "objective/non_score_reward": -0.8522033095359802, + "objective/rlhf_reward": -3.0088131859898564, + "objective/scores": 0.1, + "policy/approxkl_avg": 61.883544921875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5295162796974182, + "step": 139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995434284210205 + }, + { + "episode": 2256, + "epoch": 0.04055074235179926, + "loss/policy_avg": 0.4851709008216858, + "lr": 2.973159509202454e-06, + "objective/entropy": -114.12245178222656, + "objective/kl": 8.938103675842285, + "objective/non_score_reward": -0.8938103914260864, + "objective/rlhf_reward": -3.1752414911985394, + "objective/scores": 0.1, + "policy/approxkl_avg": 119.33918762207031, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.567252516746521, + "step": 140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977459907531738 + }, + { + "episode": 2272, + "epoch": 0.040838336269187904, + "loss/policy_avg": 0.011087119579315186, + "lr": 2.972967791411043e-06, + "objective/entropy": -39.64904022216797, + "objective/kl": 10.655853271484375, + "objective/non_score_reward": -1.0655853748321533, + "objective/rlhf_reward": -2.139635088221107, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 89.38186645507812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6346065998077393, + "step": 141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0012426376342773 + }, + { + "episode": 2288, + "epoch": 0.04112593018657655, + "loss/policy_avg": 0.18634071946144104, + "lr": 2.972776073619632e-06, + "objective/entropy": -234.11532592773438, + "objective/kl": 9.971248626708984, + "objective/non_score_reward": -0.9971247911453247, + "objective/rlhf_reward": -5.988499641418457, + "objective/scores": -0.5, + "policy/approxkl_avg": 161.76507568359375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6083584427833557, + "step": 142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975717067718506 + }, + { + "episode": 2304, + "epoch": 0.0414135241039652, + "loss/policy_avg": 0.842505693435669, + "lr": 2.972584355828221e-06, + "objective/entropy": 146.90762329101562, + "objective/kl": 12.336867332458496, + "objective/non_score_reward": -1.2336868047714233, + "objective/rlhf_reward": -4.534747010469436, + "objective/scores": 0.1, + "policy/approxkl_avg": 239.418701171875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4192795753479004, + "step": 143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997382164001465 + }, + { + "episode": 2320, + "epoch": 0.04170111802135385, + "loss/policy_avg": 0.6685837507247925, + "lr": 2.9723926380368102e-06, + "objective/entropy": 41.359195709228516, + "objective/kl": 9.3118896484375, + "objective/non_score_reward": -0.9311891794204712, + "objective/rlhf_reward": 0.6752433419227604, + "objective/scores": 1.1, + "policy/approxkl_avg": 151.73721313476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5029563307762146, + "step": 144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993720054626465 + }, + { + "episode": 2336, + "epoch": 0.0419887119387425, + "loss/policy_avg": 0.2414681613445282, + "lr": 2.9722009202453986e-06, + "objective/entropy": -22.124481201171875, + "objective/kl": 10.415138244628906, + "objective/non_score_reward": -1.0415138006210327, + "objective/rlhf_reward": -3.7660551875829693, + "objective/scores": 0.1, + "policy/approxkl_avg": 117.5968246459961, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5334524512290955, + "step": 145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998509407043457 + }, + { + "episode": 2352, + "epoch": 0.042276305856131145, + "loss/policy_avg": -0.1104317232966423, + "lr": 2.972009202453988e-06, + "objective/entropy": 89.73234558105469, + "objective/kl": 9.888954162597656, + "objective/non_score_reward": -0.9888954162597656, + "objective/rlhf_reward": -3.5555818438529965, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.99063110351562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5904539823532104, + "step": 146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002488851547241 + }, + { + "episode": 2368, + "epoch": 0.04256389977351979, + "loss/policy_avg": 0.6591033935546875, + "lr": 2.9718174846625767e-06, + "objective/entropy": 149.4038543701172, + "objective/kl": 10.047257423400879, + "objective/non_score_reward": -1.0047259330749512, + "objective/rlhf_reward": -6.018903732299805, + "objective/scores": -0.5, + "policy/approxkl_avg": 178.25811767578125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9356397986412048, + "step": 147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997607707977295 + }, + { + "episode": 2384, + "epoch": 0.04285149369090844, + "loss/policy_avg": 0.544201135635376, + "lr": 2.9716257668711655e-06, + "objective/entropy": 132.01980590820312, + "objective/kl": 9.59277629852295, + "objective/non_score_reward": -0.9592776298522949, + "objective/rlhf_reward": -3.437110504508018, + "objective/scores": 0.1, + "policy/approxkl_avg": 97.24102783203125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8296840190887451, + "step": 148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0014381408691406 + }, + { + "episode": 2400, + "epoch": 0.043139087608297086, + "loss/policy_avg": 0.36106303334236145, + "lr": 2.9714340490797547e-06, + "objective/entropy": 260.59033203125, + "objective/kl": 11.327485084533691, + "objective/non_score_reward": -1.1327484846115112, + "objective/rlhf_reward": -2.4082878849664073, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 112.34231567382812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6541630029678345, + "step": 149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988218545913696 + }, + { + "episode": 2416, + "epoch": 0.04342668152568573, + "loss/policy_avg": 0.30818748474121094, + "lr": 2.9712423312883435e-06, + "objective/entropy": 167.4129180908203, + "objective/kl": 9.699304580688477, + "objective/non_score_reward": -0.9699304103851318, + "objective/rlhf_reward": -5.879721641540527, + "objective/scores": -0.5, + "policy/approxkl_avg": 82.35363006591797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6681854724884033, + "step": 150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972755908966064 + }, + { + "episode": 2432, + "epoch": 0.04371427544307438, + "loss/policy_avg": 0.6923952102661133, + "lr": 2.9710506134969323e-06, + "objective/entropy": 48.51850128173828, + "objective/kl": 7.859927177429199, + "objective/non_score_reward": -0.7859926223754883, + "objective/rlhf_reward": -2.743970593810081, + "objective/scores": 0.1, + "policy/approxkl_avg": 103.42765808105469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7212280035018921, + "step": 151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981834888458252 + }, + { + "episode": 2448, + "epoch": 0.044001869360463026, + "loss/policy_avg": 0.11756162345409393, + "lr": 2.9708588957055216e-06, + "objective/entropy": -18.197547912597656, + "objective/kl": 9.619758605957031, + "objective/non_score_reward": -0.9619758129119873, + "objective/rlhf_reward": -2.243783268992024, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 92.98655700683594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6553933620452881, + "step": 152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970009326934814 + }, + { + "episode": 2464, + "epoch": 0.04428946327785167, + "loss/policy_avg": 0.1137843132019043, + "lr": 2.9706671779141104e-06, + "objective/entropy": 120.47866821289062, + "objective/kl": 12.719396591186523, + "objective/non_score_reward": -1.2719398736953735, + "objective/rlhf_reward": -4.687759546935558, + "objective/scores": 0.1, + "policy/approxkl_avg": 160.37051391601562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5731196403503418, + "step": 153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005669593811035 + }, + { + "episode": 2480, + "epoch": 0.04457705719524032, + "loss/policy_avg": 0.13933295011520386, + "lr": 2.9704754601226996e-06, + "objective/entropy": -150.15121459960938, + "objective/kl": 5.141759395599365, + "objective/non_score_reward": -0.5141758918762207, + "objective/rlhf_reward": -4.056703567504883, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.49239730834961, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7959345579147339, + "step": 154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0006468296051025 + }, + { + "episode": 2496, + "epoch": 0.04486465111262897, + "loss/policy_avg": 0.0854576975107193, + "lr": 2.9702837423312884e-06, + "objective/entropy": 143.63348388671875, + "objective/kl": 11.670942306518555, + "objective/non_score_reward": -1.1670942306518555, + "objective/rlhf_reward": -4.26837727278471, + "objective/scores": 0.1, + "policy/approxkl_avg": 232.97808837890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9138456583023071, + "step": 155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975340366363525 + }, + { + "episode": 2512, + "epoch": 0.045152245030017614, + "loss/policy_avg": 0.026141434907913208, + "lr": 2.9700920245398772e-06, + "objective/entropy": -4.6529083251953125, + "objective/kl": 7.081835746765137, + "objective/non_score_reward": -0.7081836462020874, + "objective/rlhf_reward": -4.832734107971191, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.927255630493164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6679246425628662, + "step": 156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991931915283203 + }, + { + "episode": 2528, + "epoch": 0.04543983894740626, + "loss/policy_avg": 0.15662409365177155, + "lr": 2.9699003067484665e-06, + "objective/entropy": 175.9498291015625, + "objective/kl": 6.773474216461182, + "objective/non_score_reward": -0.6773474216461182, + "objective/rlhf_reward": -2.3093897461891175, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.37593078613281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.47304588556289673, + "step": 157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973795413970947 + }, + { + "episode": 2544, + "epoch": 0.04572743286479491, + "loss/policy_avg": 0.23720163106918335, + "lr": 2.9697085889570553e-06, + "objective/entropy": -89.0332260131836, + "objective/kl": 8.619776725769043, + "objective/non_score_reward": -0.8619776964187622, + "objective/rlhf_reward": -0.5241918011915412, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 29.201887130737305, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6532964110374451, + "step": 158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988105297088623 + }, + { + "episode": 2560, + "epoch": 0.046015026782183555, + "loss/policy_avg": 0.2267841100692749, + "lr": 2.9695168711656445e-06, + "objective/entropy": 1.6421661376953125, + "objective/kl": 9.304061889648438, + "objective/non_score_reward": -0.9304060935974121, + "objective/rlhf_reward": 0.6783757746219639, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.62976837158203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7413831949234009, + "step": 159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987226724624634 + }, + { + "episode": 2576, + "epoch": 0.0463026206995722, + "loss/policy_avg": 0.5093711614608765, + "lr": 2.9693251533742333e-06, + "objective/entropy": -36.8858528137207, + "objective/kl": 11.298078536987305, + "objective/non_score_reward": -1.1298078298568726, + "objective/rlhf_reward": -6.51923131942749, + "objective/scores": -0.5, + "policy/approxkl_avg": 135.19918823242188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6970053911209106, + "step": 160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0019335746765137 + }, + { + "episode": 2592, + "epoch": 0.04659021461696085, + "loss/policy_avg": 0.17401546239852905, + "lr": 2.969133435582822e-06, + "objective/entropy": 172.7603759765625, + "objective/kl": 9.036718368530273, + "objective/non_score_reward": -0.9036718606948853, + "objective/rlhf_reward": -3.2146874427795407, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.195972442626953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5838289260864258, + "step": 161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981424808502197 + }, + { + "episode": 2608, + "epoch": 0.046877808534349495, + "loss/policy_avg": 0.44493553042411804, + "lr": 2.9689417177914114e-06, + "objective/entropy": -19.54338836669922, + "objective/kl": 11.437591552734375, + "objective/non_score_reward": -1.1437591314315796, + "objective/rlhf_reward": -6.57503604888916, + "objective/scores": -0.5, + "policy/approxkl_avg": 146.39810180664062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.468772292137146, + "step": 162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998663067817688 + }, + { + "episode": 2624, + "epoch": 0.04716540245173815, + "loss/policy_avg": 2.674943447113037, + "lr": 2.96875e-06, + "objective/entropy": -46.63656234741211, + "objective/kl": 3.6555447578430176, + "objective/non_score_reward": -0.3655545115470886, + "objective/rlhf_reward": 0.48519318274981194, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.321266174316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8199789524078369, + "step": 163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0079751014709473 + }, + { + "episode": 2640, + "epoch": 0.047452996369126796, + "loss/policy_avg": 0.1934666782617569, + "lr": 2.968558282208589e-06, + "objective/entropy": -81.56670379638672, + "objective/kl": 12.988276481628418, + "objective/non_score_reward": -1.2988277673721313, + "objective/rlhf_reward": -2.2715921520602436, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 293.0709228515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7369424104690552, + "step": 164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997572898864746 + }, + { + "episode": 2656, + "epoch": 0.04774059028651544, + "loss/policy_avg": 0.15999506413936615, + "lr": 2.968366564417178e-06, + "objective/entropy": 46.440673828125, + "objective/kl": 13.255866050720215, + "objective/non_score_reward": -1.3255865573883057, + "objective/rlhf_reward": -7.302346229553223, + "objective/scores": -0.5, + "policy/approxkl_avg": 191.44906616210938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8060017824172974, + "step": 165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992680549621582 + }, + { + "episode": 2672, + "epoch": 0.04802818420390409, + "loss/policy_avg": -0.11308600753545761, + "lr": 2.968174846625767e-06, + "objective/entropy": 229.09007263183594, + "objective/kl": 10.140392303466797, + "objective/non_score_reward": -1.0140392780303955, + "objective/rlhf_reward": -3.6561572611331936, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.9848861694336, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8131240606307983, + "step": 166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0012285709381104 + }, + { + "episode": 2688, + "epoch": 0.048315778121292736, + "loss/policy_avg": 0.2457667887210846, + "lr": 2.967983128834356e-06, + "objective/entropy": -224.82394409179688, + "objective/kl": 5.642745018005371, + "objective/non_score_reward": -0.5642745494842529, + "objective/rlhf_reward": 0.666621024965075, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 43.03296661376953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.596450686454773, + "step": 167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9965918064117432 + }, + { + "episode": 2704, + "epoch": 0.04860337203868138, + "loss/policy_avg": 0.16231290996074677, + "lr": 2.9677914110429446e-06, + "objective/entropy": 53.96503829956055, + "objective/kl": 9.837738037109375, + "objective/non_score_reward": -0.9837738275527954, + "objective/rlhf_reward": -5.935094833374023, + "objective/scores": -0.5, + "policy/approxkl_avg": 144.24468994140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5579663515090942, + "step": 168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979324340820312 + }, + { + "episode": 2720, + "epoch": 0.04889096595607003, + "loss/policy_avg": 0.5551764369010925, + "lr": 2.967599693251534e-06, + "objective/entropy": -89.17186737060547, + "objective/kl": 4.356380939483643, + "objective/non_score_reward": -0.4356381893157959, + "objective/rlhf_reward": 0.08227607312529184, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 33.07392120361328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44729527831077576, + "step": 169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999492168426514 + }, + { + "episode": 2736, + "epoch": 0.04917855987345868, + "loss/policy_avg": 0.4932054579257965, + "lr": 2.9674079754601227e-06, + "objective/entropy": -11.498092651367188, + "objective/kl": 10.226805686950684, + "objective/non_score_reward": -1.022680401802063, + "objective/rlhf_reward": -1.1670026972305505, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 102.75898742675781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6330607533454895, + "step": 170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996318817138672 + }, + { + "episode": 2752, + "epoch": 0.049466153790847324, + "loss/policy_avg": 0.1898762285709381, + "lr": 2.9672162576687115e-06, + "objective/entropy": -70.60189056396484, + "objective/kl": 9.331042289733887, + "objective/non_score_reward": -0.9331042766571045, + "objective/rlhf_reward": -3.3324170172214505, + "objective/scores": 0.1, + "policy/approxkl_avg": 114.848388671875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6309188604354858, + "step": 171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009915828704834 + }, + { + "episode": 2768, + "epoch": 0.04975374770823597, + "loss/policy_avg": 0.2297024428844452, + "lr": 2.9670245398773007e-06, + "objective/entropy": 16.196762084960938, + "objective/kl": 11.903242111206055, + "objective/non_score_reward": -1.1903241872787476, + "objective/rlhf_reward": -2.638590636030708, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 139.62954711914062, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.705674946308136, + "step": 172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981346130371094 + }, + { + "episode": 2784, + "epoch": 0.05004134162562462, + "loss/policy_avg": 0.39737239480018616, + "lr": 2.9668328220858895e-06, + "objective/entropy": -89.89057922363281, + "objective/kl": 7.809091567993164, + "objective/non_score_reward": -0.7809092402458191, + "objective/rlhf_reward": -5.123636722564697, + "objective/scores": -0.5, + "policy/approxkl_avg": 93.33058166503906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.9152443408966064, + "step": 173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9968818426132202 + }, + { + "episode": 2800, + "epoch": 0.050328935543013265, + "loss/policy_avg": 0.06229601055383682, + "lr": 2.9666411042944783e-06, + "objective/entropy": 105.8713607788086, + "objective/kl": 10.667573928833008, + "objective/non_score_reward": -1.0667574405670166, + "objective/rlhf_reward": -2.6629095709958843, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 72.16740417480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5240480899810791, + "step": 174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984283447265625 + }, + { + "episode": 2816, + "epoch": 0.05061652946040191, + "loss/policy_avg": 0.23533995449543, + "lr": 2.9664493865030676e-06, + "objective/entropy": 73.5090103149414, + "objective/kl": 8.236711502075195, + "objective/non_score_reward": -0.8236711621284485, + "objective/rlhf_reward": 1.1053153514862064, + "objective/scores": 1.1, + "policy/approxkl_avg": 86.34696197509766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.549429714679718, + "step": 175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980177879333496 + }, + { + "episode": 2832, + "epoch": 0.05090412337779056, + "loss/policy_avg": 0.23864325881004333, + "lr": 2.9662576687116564e-06, + "objective/entropy": 44.60013198852539, + "objective/kl": 11.681440353393555, + "objective/non_score_reward": -1.1681439876556396, + "objective/rlhf_reward": -4.272575950622558, + "objective/scores": 0.1, + "policy/approxkl_avg": 141.5290985107422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6180237531661987, + "step": 176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996826410293579 + }, + { + "episode": 2848, + "epoch": 0.051191717295179205, + "loss/policy_avg": 0.12463901191949844, + "lr": 2.9660659509202456e-06, + "objective/entropy": -185.71621704101562, + "objective/kl": 6.215152263641357, + "objective/non_score_reward": -0.6215152740478516, + "objective/rlhf_reward": 1.9139389447867874, + "objective/scores": 1.1, + "policy/approxkl_avg": 31.746864318847656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6655900478363037, + "step": 177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984078407287598 + }, + { + "episode": 2864, + "epoch": 0.05147931121256785, + "loss/policy_avg": 0.41434115171432495, + "lr": 2.9658742331288344e-06, + "objective/entropy": 125.62101745605469, + "objective/kl": 9.051776885986328, + "objective/non_score_reward": -0.9051777124404907, + "objective/rlhf_reward": -1.2207108795642851, + "objective/scores": 0.6, + "policy/approxkl_avg": 89.37495422363281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6031736731529236, + "step": 178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9960637092590332 + }, + { + "episode": 2880, + "epoch": 0.0517669051299565, + "loss/policy_avg": 0.0819600522518158, + "lr": 2.9656825153374232e-06, + "objective/entropy": 167.4649658203125, + "objective/kl": 11.02088737487793, + "objective/non_score_reward": -1.1020888090133667, + "objective/rlhf_reward": -2.7464959226256473, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 87.59506225585938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5513530969619751, + "step": 179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990684986114502 + }, + { + "episode": 2896, + "epoch": 0.052054499047345146, + "loss/policy_avg": 0.28252676129341125, + "lr": 2.9654907975460125e-06, + "objective/entropy": -71.35255432128906, + "objective/kl": 6.714944362640381, + "objective/non_score_reward": -0.6714943647384644, + "objective/rlhf_reward": 1.7140224814414982, + "objective/scores": 1.1, + "policy/approxkl_avg": 61.805137634277344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5994826555252075, + "step": 180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976210594177246 + }, + { + "episode": 2912, + "epoch": 0.05234209296473379, + "loss/policy_avg": 0.0703403651714325, + "lr": 2.9652990797546013e-06, + "objective/entropy": -239.35452270507812, + "objective/kl": 10.807499885559082, + "objective/non_score_reward": -1.0807499885559082, + "objective/rlhf_reward": -6.322999954223633, + "objective/scores": -0.5, + "policy/approxkl_avg": 269.2040710449219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.626622200012207, + "step": 181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9963631629943848 + }, + { + "episode": 2928, + "epoch": 0.052629686882122446, + "loss/policy_avg": 0.5756047964096069, + "lr": 2.9651073619631905e-06, + "objective/entropy": 81.2969741821289, + "objective/kl": 9.503179550170898, + "objective/non_score_reward": -0.9503180384635925, + "objective/rlhf_reward": -1.6785659066596368, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 69.6463394165039, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4174951910972595, + "step": 182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000248432159424 + }, + { + "episode": 2944, + "epoch": 0.05291728079951109, + "loss/policy_avg": 0.19677188992500305, + "lr": 2.9649156441717793e-06, + "objective/entropy": 97.329345703125, + "objective/kl": 7.023814678192139, + "objective/non_score_reward": -0.702381432056427, + "objective/rlhf_reward": -4.809525966644287, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.869384765625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5866813659667969, + "step": 183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983813762664795 + }, + { + "episode": 2960, + "epoch": 0.05320487471689974, + "loss/policy_avg": -0.09021998941898346, + "lr": 2.964723926380368e-06, + "objective/entropy": -104.08444213867188, + "objective/kl": 8.810539245605469, + "objective/non_score_reward": -0.8810538649559021, + "objective/rlhf_reward": -3.1242154896259304, + "objective/scores": 0.1, + "policy/approxkl_avg": 64.01618957519531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6541459560394287, + "step": 184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997464418411255 + }, + { + "episode": 2976, + "epoch": 0.05349246863428839, + "loss/policy_avg": 0.7093124389648438, + "lr": 2.9645322085889574e-06, + "objective/entropy": -2.2389583587646484, + "objective/kl": 12.9491548538208, + "objective/non_score_reward": -1.2949154376983643, + "objective/rlhf_reward": -3.056955727116142, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 159.49282836914062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5769963264465332, + "step": 185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998090386390686 + }, + { + "episode": 2992, + "epoch": 0.053780062551677034, + "loss/policy_avg": 0.6172127723693848, + "lr": 2.964340490797546e-06, + "objective/entropy": 96.50690460205078, + "objective/kl": 9.217771530151367, + "objective/non_score_reward": -0.9217771291732788, + "objective/rlhf_reward": 0.7128913417458538, + "objective/scores": 1.1, + "policy/approxkl_avg": 116.59093475341797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5699312686920166, + "step": 186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982131719589233 + }, + { + "episode": 3008, + "epoch": 0.05406765646906568, + "loss/policy_avg": 0.1616460084915161, + "lr": 2.964148773006135e-06, + "objective/entropy": -13.621841430664062, + "objective/kl": 11.028844833374023, + "objective/non_score_reward": -1.1028845310211182, + "objective/rlhf_reward": -2.8074182084837727, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 125.17231750488281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6993151903152466, + "step": 187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992413520812988 + }, + { + "episode": 3024, + "epoch": 0.05435525038645433, + "loss/policy_avg": -0.022246820852160454, + "lr": 2.9639570552147242e-06, + "objective/entropy": -95.69093322753906, + "objective/kl": 8.957221031188965, + "objective/non_score_reward": -0.8957222700119019, + "objective/rlhf_reward": -3.1828890204429623, + "objective/scores": 0.1, + "policy/approxkl_avg": 129.6190948486328, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7190344333648682, + "step": 188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0001869201660156 + }, + { + "episode": 3040, + "epoch": 0.054642844303842975, + "loss/policy_avg": 0.3806031346321106, + "lr": 2.9637653374233126e-06, + "objective/entropy": 31.125537872314453, + "objective/kl": 14.289737701416016, + "objective/non_score_reward": -1.428973913192749, + "objective/rlhf_reward": -5.315895533561706, + "objective/scores": 0.1, + "policy/approxkl_avg": 252.6319122314453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4689924716949463, + "step": 189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998744010925293 + }, + { + "episode": 3056, + "epoch": 0.05493043822123162, + "loss/policy_avg": 0.38266557455062866, + "lr": 2.963573619631902e-06, + "objective/entropy": -70.03289794921875, + "objective/kl": 11.780672073364258, + "objective/non_score_reward": -1.1780673265457153, + "objective/rlhf_reward": -1.7885502024900646, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 211.99916076660156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.736452043056488, + "step": 190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998718023300171 + }, + { + "episode": 3072, + "epoch": 0.05521803213862027, + "loss/policy_avg": -0.1702420711517334, + "lr": 2.9633819018404906e-06, + "objective/entropy": -88.13114166259766, + "objective/kl": 5.207786560058594, + "objective/non_score_reward": -0.5207787156105042, + "objective/rlhf_reward": -0.13570352919572182, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 66.82286834716797, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6916394233703613, + "step": 191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0128471851348877 + }, + { + "episode": 3088, + "epoch": 0.055505626056008915, + "loss/policy_avg": 0.47809040546417236, + "lr": 2.96319018404908e-06, + "objective/entropy": -146.3225555419922, + "objective/kl": 9.557235717773438, + "objective/non_score_reward": -0.9557235240936279, + "objective/rlhf_reward": -1.8754830089973764, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 62.17351531982422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6510109901428223, + "step": 192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980770349502563 + }, + { + "episode": 3104, + "epoch": 0.05579321997339756, + "loss/policy_avg": 0.695833683013916, + "lr": 2.9629984662576687e-06, + "objective/entropy": 100.23960876464844, + "objective/kl": 9.5051908493042, + "objective/non_score_reward": -0.9505190849304199, + "objective/rlhf_reward": -3.4020764887332913, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.06460571289062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7171235084533691, + "step": 193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999764084815979 + }, + { + "episode": 3120, + "epoch": 0.05608081389078621, + "loss/policy_avg": 0.16662254929542542, + "lr": 2.9628067484662575e-06, + "objective/entropy": 166.84793090820312, + "objective/kl": 12.172272682189941, + "objective/non_score_reward": -1.2172273397445679, + "objective/rlhf_reward": -4.4689091391861435, + "objective/scores": 0.1, + "policy/approxkl_avg": 133.61741638183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5888961553573608, + "step": 194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996709823608398 + }, + { + "episode": 3136, + "epoch": 0.056368407808174856, + "loss/policy_avg": 0.7391001582145691, + "lr": 2.9626150306748467e-06, + "objective/entropy": 7.757408142089844, + "objective/kl": 4.110318183898926, + "objective/non_score_reward": -0.4110318422317505, + "objective/rlhf_reward": 2.755872675776482, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.918201446533203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6208719611167908, + "step": 195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999298095703125 + }, + { + "episode": 3152, + "epoch": 0.0566560017255635, + "loss/policy_avg": -0.04099520295858383, + "lr": 2.9624233128834355e-06, + "objective/entropy": -164.4539337158203, + "objective/kl": 7.288479328155518, + "objective/non_score_reward": -0.7288479804992676, + "objective/rlhf_reward": 1.4846080929040912, + "objective/scores": 1.1, + "policy/approxkl_avg": 88.62603759765625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7371472120285034, + "step": 196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998602867126465 + }, + { + "episode": 3168, + "epoch": 0.05694359564295215, + "loss/policy_avg": 0.10230283439159393, + "lr": 2.9622315950920248e-06, + "objective/entropy": -6.653453826904297, + "objective/kl": 10.225364685058594, + "objective/non_score_reward": -1.0225365161895752, + "objective/rlhf_reward": -2.265317554744791, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 116.62469482421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6673995852470398, + "step": 197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9955484867095947 + }, + { + "episode": 3184, + "epoch": 0.057231189560340796, + "loss/policy_avg": 0.1610383689403534, + "lr": 2.9620398773006136e-06, + "objective/entropy": 83.79240417480469, + "objective/kl": 6.000893592834473, + "objective/non_score_reward": -0.6000893712043762, + "objective/rlhf_reward": -2.000357484817505, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.31947326660156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49990415573120117, + "step": 198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981744289398193 + }, + { + "episode": 3200, + "epoch": 0.05751878347772944, + "loss/policy_avg": 2.8004019260406494, + "lr": 2.9618481595092024e-06, + "objective/entropy": 105.93596649169922, + "objective/kl": 9.928138732910156, + "objective/non_score_reward": -0.9928138852119446, + "objective/rlhf_reward": 0.4287445038557056, + "objective/scores": 1.1, + "policy/approxkl_avg": 68.9993667602539, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5279573202133179, + "step": 199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0012660026550293 + }, + { + "episode": 3216, + "epoch": 0.05780637739511809, + "loss/policy_avg": 0.1186942383646965, + "lr": 2.9616564417177916e-06, + "objective/entropy": 176.61386108398438, + "objective/kl": 15.481854438781738, + "objective/non_score_reward": -1.5481854677200317, + "objective/rlhf_reward": -8.192741394042969, + "objective/scores": -0.5, + "policy/approxkl_avg": 287.8005676269531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8114917278289795, + "step": 200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980297088623047 + }, + { + "episode": 3232, + "epoch": 0.058093971312506744, + "loss/policy_avg": 0.12549322843551636, + "lr": 2.9614647239263804e-06, + "objective/entropy": -43.08008575439453, + "objective/kl": 1.7479121685028076, + "objective/non_score_reward": -0.17479124665260315, + "objective/rlhf_reward": 1.1256637878987084, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 1.0369627475738525, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6038674116134644, + "step": 201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001643657684326 + }, + { + "episode": 3248, + "epoch": 0.05838156522989539, + "loss/policy_avg": 0.37325456738471985, + "lr": 2.9612730061349692e-06, + "objective/entropy": -55.67109680175781, + "objective/kl": 16.567649841308594, + "objective/non_score_reward": -1.6567649841308594, + "objective/rlhf_reward": -8.627059936523438, + "objective/scores": -0.5, + "policy/approxkl_avg": 345.91241455078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.688873291015625, + "step": 202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9956648349761963 + }, + { + "episode": 3264, + "epoch": 0.05866915914728404, + "loss/policy_avg": 0.4203230142593384, + "lr": 2.9610812883435585e-06, + "objective/entropy": -102.62028503417969, + "objective/kl": 8.865509033203125, + "objective/non_score_reward": -0.8865509629249573, + "objective/rlhf_reward": -0.622484960348579, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 93.6322021484375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6493798494338989, + "step": 203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986227750778198 + }, + { + "episode": 3280, + "epoch": 0.058956753064672685, + "loss/policy_avg": 0.6964031457901001, + "lr": 2.9608895705521473e-06, + "objective/entropy": 110.6749496459961, + "objective/kl": 17.85771369934082, + "objective/non_score_reward": -1.785771131515503, + "objective/rlhf_reward": -9.143084526062012, + "objective/scores": -0.5, + "policy/approxkl_avg": 490.62164306640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6564576625823975, + "step": 204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979726076126099 + }, + { + "episode": 3296, + "epoch": 0.05924434698206133, + "loss/policy_avg": 0.2527036964893341, + "lr": 2.9606978527607365e-06, + "objective/entropy": -116.59994506835938, + "objective/kl": 11.40339469909668, + "objective/non_score_reward": -1.1403393745422363, + "objective/rlhf_reward": -2.161357662081718, + "objective/scores": 0.6, + "policy/approxkl_avg": 91.01628112792969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6173946857452393, + "step": 205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987362623214722 + }, + { + "episode": 3312, + "epoch": 0.05953194089944998, + "loss/policy_avg": 0.2560387849807739, + "lr": 2.9605061349693253e-06, + "objective/entropy": -39.499755859375, + "objective/kl": 6.948145866394043, + "objective/non_score_reward": -0.6948145627975464, + "objective/rlhf_reward": 1.6207415699958805, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.67678833007812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6708812713623047, + "step": 206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997892141342163 + }, + { + "episode": 3328, + "epoch": 0.059819534816838625, + "loss/policy_avg": 0.27111512422561646, + "lr": 2.960314417177914e-06, + "objective/entropy": 84.36082458496094, + "objective/kl": 7.624824523925781, + "objective/non_score_reward": -0.7624824643135071, + "objective/rlhf_reward": -5.049929618835449, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.3857879638672, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.859719455242157, + "step": 207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986274242401123 + }, + { + "episode": 3344, + "epoch": 0.06010712873422727, + "loss/policy_avg": 0.3739089071750641, + "lr": 2.9601226993865034e-06, + "objective/entropy": -137.59747314453125, + "objective/kl": 7.363832950592041, + "objective/non_score_reward": -0.7363832592964172, + "objective/rlhf_reward": -2.5455331265926358, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.08015441894531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6479380130767822, + "step": 208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974188804626465 + }, + { + "episode": 3360, + "epoch": 0.06039472265161592, + "loss/policy_avg": 0.2446056306362152, + "lr": 2.959930981595092e-06, + "objective/entropy": 87.66815185546875, + "objective/kl": 6.449171543121338, + "objective/non_score_reward": -0.6449171900749207, + "objective/rlhf_reward": -2.1796687602996823, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.2250862121582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4830004572868347, + "step": 209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996240139007568 + }, + { + "episode": 3376, + "epoch": 0.060682316569004566, + "loss/policy_avg": 0.19204621016979218, + "lr": 2.9597392638036814e-06, + "objective/entropy": 67.65581512451172, + "objective/kl": 9.747137069702148, + "objective/non_score_reward": -0.9747136831283569, + "objective/rlhf_reward": -3.4988546952605244, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.63888549804688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5157696604728699, + "step": 210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9994913339614868 + }, + { + "episode": 3392, + "epoch": 0.06096991048639321, + "loss/policy_avg": -0.3198900520801544, + "lr": 2.9595475460122702e-06, + "objective/entropy": 64.95191955566406, + "objective/kl": 6.794856071472168, + "objective/non_score_reward": -0.6794856190681458, + "objective/rlhf_reward": -2.317942655086517, + "objective/scores": 0.1, + "policy/approxkl_avg": 64.20761108398438, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6942774653434753, + "step": 211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.003568649291992 + }, + { + "episode": 3408, + "epoch": 0.06125750440378186, + "loss/policy_avg": 0.8114681839942932, + "lr": 2.959355828220859e-06, + "objective/entropy": 88.22362518310547, + "objective/kl": 10.00731372833252, + "objective/non_score_reward": -1.0007314682006836, + "objective/rlhf_reward": 0.39707423150539434, + "objective/scores": 1.1, + "policy/approxkl_avg": 100.05908203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5509470105171204, + "step": 212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995248317718506 + }, + { + "episode": 3424, + "epoch": 0.061545098321170506, + "loss/policy_avg": 0.3652896285057068, + "lr": 2.959164110429448e-06, + "objective/entropy": -220.09613037109375, + "objective/kl": 8.2984037399292, + "objective/non_score_reward": -0.8298404216766357, + "objective/rlhf_reward": -2.919361627101898, + "objective/scores": 0.1, + "policy/approxkl_avg": 93.64591979980469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5828021168708801, + "step": 213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000081777572632 + }, + { + "episode": 3440, + "epoch": 0.06183269223855915, + "loss/policy_avg": 0.028781473636627197, + "lr": 2.9589723926380366e-06, + "objective/entropy": 72.11604309082031, + "objective/kl": 2.359449625015259, + "objective/non_score_reward": -0.2359449863433838, + "objective/rlhf_reward": -0.5437799677252769, + "objective/scores": 0.1, + "policy/approxkl_avg": 0.3476608991622925, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4451746940612793, + "step": 214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0019702911376953 + }, + { + "episode": 3456, + "epoch": 0.0621202861559478, + "loss/policy_avg": 0.5238405466079712, + "lr": 2.958780674846626e-06, + "objective/entropy": 94.75743103027344, + "objective/kl": 12.506973266601562, + "objective/non_score_reward": -1.250697374343872, + "objective/rlhf_reward": -4.602789735794067, + "objective/scores": 0.1, + "policy/approxkl_avg": 134.81378173828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6946749091148376, + "step": 215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987130165100098 + }, + { + "episode": 3472, + "epoch": 0.06240788007333645, + "loss/policy_avg": 0.357485830783844, + "lr": 2.9585889570552147e-06, + "objective/entropy": -56.58507537841797, + "objective/kl": 15.03990364074707, + "objective/non_score_reward": -1.5039904117584229, + "objective/rlhf_reward": -4.068550298886235, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 188.71969604492188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5828025341033936, + "step": 216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974546432495117 + }, + { + "episode": 3488, + "epoch": 0.0626954739907251, + "loss/policy_avg": 0.5269087553024292, + "lr": 2.9583972392638035e-06, + "objective/entropy": -54.57160949707031, + "objective/kl": 10.709373474121094, + "objective/non_score_reward": -1.0709375143051147, + "objective/rlhf_reward": -6.283750057220459, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.47734069824219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4404389560222626, + "step": 217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983222484588623 + }, + { + "episode": 3504, + "epoch": 0.06298306790811374, + "loss/policy_avg": 0.17589987814426422, + "lr": 2.9582055214723927e-06, + "objective/entropy": 64.53421020507812, + "objective/kl": 7.162571907043457, + "objective/non_score_reward": -0.7162571549415588, + "objective/rlhf_reward": -0.7423223874726631, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 121.216552734375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4267624020576477, + "step": 218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995992183685303 + }, + { + "episode": 3520, + "epoch": 0.0632706618255024, + "loss/policy_avg": 0.26339495182037354, + "lr": 2.9580138036809815e-06, + "objective/entropy": 229.096435546875, + "objective/kl": 11.569236755371094, + "objective/non_score_reward": -1.156923532485962, + "objective/rlhf_reward": -1.7039751603615017, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 217.75863647460938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7306234836578369, + "step": 219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9954323768615723 + }, + { + "episode": 3536, + "epoch": 0.06355825574289103, + "loss/policy_avg": 0.14159545302391052, + "lr": 2.9578220858895708e-06, + "objective/entropy": -69.43865966796875, + "objective/kl": 5.585199356079102, + "objective/non_score_reward": -0.5585199594497681, + "objective/rlhf_reward": -0.11137341179040439, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 20.743200302124023, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7005258202552795, + "step": 220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997767448425293 + }, + { + "episode": 3552, + "epoch": 0.06384584966027969, + "loss/policy_avg": -0.5519238710403442, + "lr": 2.9576303680981596e-06, + "objective/entropy": 208.38470458984375, + "objective/kl": 6.988656997680664, + "objective/non_score_reward": -0.6988657712936401, + "objective/rlhf_reward": -2.395462906360626, + "objective/scores": 0.1, + "policy/approxkl_avg": 93.66447448730469, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7662152051925659, + "step": 221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0369105339050293 + }, + { + "episode": 3568, + "epoch": 0.06413344357766833, + "loss/policy_avg": 0.047732796519994736, + "lr": 2.9574386503067484e-06, + "objective/entropy": 143.40084838867188, + "objective/kl": 11.851188659667969, + "objective/non_score_reward": -1.1851186752319336, + "objective/rlhf_reward": -2.340474939346313, + "objective/scores": 0.6, + "policy/approxkl_avg": 151.91505432128906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7239178419113159, + "step": 222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001242160797119 + }, + { + "episode": 3584, + "epoch": 0.06442103749505698, + "loss/policy_avg": 0.7117222547531128, + "lr": 2.9572469325153376e-06, + "objective/entropy": -145.98013305664062, + "objective/kl": 12.114925384521484, + "objective/non_score_reward": -1.2114924192428589, + "objective/rlhf_reward": -3.1126364628473913, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 156.07591247558594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7104471921920776, + "step": 223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9960601329803467 + }, + { + "episode": 3600, + "epoch": 0.06470863141244562, + "loss/policy_avg": -0.4437275230884552, + "lr": 2.9570552147239264e-06, + "objective/entropy": 7.181800842285156, + "objective/kl": 4.8645524978637695, + "objective/non_score_reward": -0.4864552319049835, + "objective/rlhf_reward": -1.5458209127187728, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.22005462646484, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.3355029821395874, + "step": 224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003509044647217 + }, + { + "episode": 3616, + "epoch": 0.06499622532983428, + "loss/policy_avg": 0.09265188872814178, + "lr": 2.9568634969325152e-06, + "objective/entropy": -261.4568176269531, + "objective/kl": 8.175820350646973, + "objective/non_score_reward": -0.8175821304321289, + "objective/rlhf_reward": -2.870328368991613, + "objective/scores": 0.1, + "policy/approxkl_avg": 53.4001350402832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6872222423553467, + "step": 225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999666690826416 + }, + { + "episode": 3632, + "epoch": 0.06528381924722292, + "loss/policy_avg": 0.27531012892723083, + "lr": 2.9566717791411045e-06, + "objective/entropy": 15.044029235839844, + "objective/kl": 10.096330642700195, + "objective/non_score_reward": -1.0096330642700195, + "objective/rlhf_reward": -6.038532257080078, + "objective/scores": -0.5, + "policy/approxkl_avg": 120.44912719726562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7716231346130371, + "step": 226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978790283203125 + }, + { + "episode": 3648, + "epoch": 0.06557141316461157, + "loss/policy_avg": 0.27537640929222107, + "lr": 2.9564800613496933e-06, + "objective/entropy": 171.693359375, + "objective/kl": 13.582971572875977, + "objective/non_score_reward": -1.3582972288131714, + "objective/rlhf_reward": -2.5094699307691783, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 178.1964111328125, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6318575143814087, + "step": 227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985480308532715 + }, + { + "episode": 3664, + "epoch": 0.06585900708200021, + "loss/policy_avg": 0.5710182785987854, + "lr": 2.9562883435582825e-06, + "objective/entropy": 163.5762939453125, + "objective/kl": 11.832403182983398, + "objective/non_score_reward": -1.1832401752471924, + "objective/rlhf_reward": -4.332961043715477, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.51448059082031, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7185041904449463, + "step": 228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005269050598145 + }, + { + "episode": 3680, + "epoch": 0.06614660099938886, + "loss/policy_avg": 0.2001960575580597, + "lr": 2.9560966257668713e-06, + "objective/entropy": -31.42165756225586, + "objective/kl": 11.342616081237793, + "objective/non_score_reward": -1.1342616081237793, + "objective/rlhf_reward": -2.5896352929639175, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 163.35409545898438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5161740779876709, + "step": 229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981510639190674 + }, + { + "episode": 3696, + "epoch": 0.06643419491677752, + "loss/policy_avg": 0.033837247639894485, + "lr": 2.95590490797546e-06, + "objective/entropy": -127.0381088256836, + "objective/kl": 12.982643127441406, + "objective/non_score_reward": -1.298264503479004, + "objective/rlhf_reward": -7.193058013916016, + "objective/scores": -0.5, + "policy/approxkl_avg": 175.97671508789062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7343233823776245, + "step": 230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994841814041138 + }, + { + "episode": 3712, + "epoch": 0.06672178883416616, + "loss/policy_avg": 0.3433837890625, + "lr": 2.9557131901840494e-06, + "objective/entropy": 68.5723876953125, + "objective/kl": 13.85904598236084, + "objective/non_score_reward": -1.3859045505523682, + "objective/rlhf_reward": -3.8102850029865896, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 221.48330688476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7557601928710938, + "step": 231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970917701721191 + }, + { + "episode": 3728, + "epoch": 0.06700938275155481, + "loss/policy_avg": 0.2773955464363098, + "lr": 2.955521472392638e-06, + "objective/entropy": -77.472412109375, + "objective/kl": 8.535304069519043, + "objective/non_score_reward": -0.8535304069519043, + "objective/rlhf_reward": -0.4904027923357215, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 31.26820182800293, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47358882427215576, + "step": 232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977991580963135 + }, + { + "episode": 3744, + "epoch": 0.06729697666894345, + "loss/policy_avg": 0.419773131608963, + "lr": 2.9553297546012274e-06, + "objective/entropy": 151.9324951171875, + "objective/kl": 10.881217956542969, + "objective/non_score_reward": -1.0881218910217285, + "objective/rlhf_reward": -2.4050763649510696, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 90.420166015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7324544787406921, + "step": 233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985604286193848 + }, + { + "episode": 3760, + "epoch": 0.0675845705863321, + "loss/policy_avg": 0.19571278989315033, + "lr": 2.955138036809816e-06, + "objective/entropy": 164.4075927734375, + "objective/kl": 9.931008338928223, + "objective/non_score_reward": -0.9931010007858276, + "objective/rlhf_reward": -1.8496975473323207, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 90.80259704589844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4490511417388916, + "step": 234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968398809432983 + }, + { + "episode": 3776, + "epoch": 0.06787216450372074, + "loss/policy_avg": -0.0029441099613904953, + "lr": 2.954946319018405e-06, + "objective/entropy": -57.89764404296875, + "objective/kl": 11.663187026977539, + "objective/non_score_reward": -1.1663187742233276, + "objective/rlhf_reward": -2.265275067090988, + "objective/scores": 0.6, + "policy/approxkl_avg": 56.57416915893555, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6279686689376831, + "step": 235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994487762451172 + }, + { + "episode": 3792, + "epoch": 0.0681597584211094, + "loss/policy_avg": 0.19873343408107758, + "lr": 2.954754601226994e-06, + "objective/entropy": 69.95574951171875, + "objective/kl": 2.667611598968506, + "objective/non_score_reward": -0.2667612135410309, + "objective/rlhf_reward": -3.067044734954834, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.46300506591797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6580133438110352, + "step": 236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998089075088501 + }, + { + "episode": 3808, + "epoch": 0.06844735233849804, + "loss/policy_avg": 0.06835653632879257, + "lr": 2.9545628834355827e-06, + "objective/entropy": -18.042882919311523, + "objective/kl": 10.576539993286133, + "objective/non_score_reward": -1.0576539039611816, + "objective/rlhf_reward": -6.230615615844727, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.800323486328125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.38686278462409973, + "step": 237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9957520961761475 + }, + { + "episode": 3824, + "epoch": 0.06873494625588669, + "loss/policy_avg": 0.09460563957691193, + "lr": 2.954371165644172e-06, + "objective/entropy": -168.65773010253906, + "objective/kl": 9.4718017578125, + "objective/non_score_reward": -0.9471801519393921, + "objective/rlhf_reward": -3.388720667362213, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.32270812988281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3954545259475708, + "step": 238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000286817550659 + }, + { + "episode": 3840, + "epoch": 0.06902254017327533, + "loss/policy_avg": -0.28287017345428467, + "lr": 2.9541794478527607e-06, + "objective/entropy": 191.8770751953125, + "objective/kl": 5.481754302978516, + "objective/non_score_reward": -0.5481754541397095, + "objective/rlhf_reward": -4.192701816558838, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.588523864746094, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4681757688522339, + "step": 239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001433849334717 + }, + { + "episode": 3856, + "epoch": 0.06931013409066399, + "loss/policy_avg": 0.4215943217277527, + "lr": 2.9539877300613495e-06, + "objective/entropy": 122.14271545410156, + "objective/kl": 11.92599868774414, + "objective/non_score_reward": -1.1926000118255615, + "objective/rlhf_reward": -4.370399898290634, + "objective/scores": 0.1, + "policy/approxkl_avg": 73.78562927246094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6994770765304565, + "step": 240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991042613983154 + }, + { + "episode": 3872, + "epoch": 0.06959772800805263, + "loss/policy_avg": 0.14229349792003632, + "lr": 2.9537960122699387e-06, + "objective/entropy": 97.91790008544922, + "objective/kl": 11.320087432861328, + "objective/non_score_reward": -1.1320087909698486, + "objective/rlhf_reward": -0.1280353426933285, + "objective/scores": 1.1, + "policy/approxkl_avg": 53.905311584472656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7801535129547119, + "step": 241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005578994750977 + }, + { + "episode": 3888, + "epoch": 0.06988532192544128, + "loss/policy_avg": 0.6044175624847412, + "lr": 2.9536042944785275e-06, + "objective/entropy": -45.497032165527344, + "objective/kl": 14.7174072265625, + "objective/non_score_reward": -1.47174072265625, + "objective/rlhf_reward": -3.9395517510938003, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 80.24563598632812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4917639493942261, + "step": 242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982421398162842 + }, + { + "episode": 3904, + "epoch": 0.07017291584282992, + "loss/policy_avg": 0.37928763031959534, + "lr": 2.9534125766871168e-06, + "objective/entropy": 85.73689270019531, + "objective/kl": 3.559020519256592, + "objective/non_score_reward": -0.3559020459651947, + "objective/rlhf_reward": 2.9763918161392215, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.9539512395858765, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5831518173217773, + "step": 243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.023195743560791 + }, + { + "episode": 3920, + "epoch": 0.07046050976021857, + "loss/policy_avg": 0.06145331636071205, + "lr": 2.9532208588957056e-06, + "objective/entropy": -83.13604736328125, + "objective/kl": 10.009160995483398, + "objective/non_score_reward": -1.0009161233901978, + "objective/rlhf_reward": -6.003664016723633, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.118873596191406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7157425880432129, + "step": 244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9978001117706299 + }, + { + "episode": 3936, + "epoch": 0.07074810367760721, + "loss/policy_avg": 0.34171396493911743, + "lr": 2.9530291411042944e-06, + "objective/entropy": -138.12054443359375, + "objective/kl": 13.933228492736816, + "objective/non_score_reward": -1.3933229446411133, + "objective/rlhf_reward": -2.649572645069334, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 283.137939453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8090240955352783, + "step": 245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99606454372406 + }, + { + "episode": 3952, + "epoch": 0.07103569759499587, + "loss/policy_avg": 0.05803845077753067, + "lr": 2.9528374233128836e-06, + "objective/entropy": -0.7659759521484375, + "objective/kl": 10.326383590698242, + "objective/non_score_reward": -1.0326383113861084, + "objective/rlhf_reward": -3.730553215742111, + "objective/scores": 0.1, + "policy/approxkl_avg": 82.2501449584961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6106427311897278, + "step": 246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012848377227783 + }, + { + "episode": 3968, + "epoch": 0.0713232915123845, + "loss/policy_avg": 0.27074679732322693, + "lr": 2.9526457055214724e-06, + "objective/entropy": -100.07415771484375, + "objective/kl": 9.287663459777832, + "objective/non_score_reward": -0.9287664294242859, + "objective/rlhf_reward": -1.592359433249507, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 4.526418209075928, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5503559112548828, + "step": 247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001773834228516 + }, + { + "episode": 3984, + "epoch": 0.07161088542977316, + "loss/policy_avg": 0.1150532141327858, + "lr": 2.9524539877300617e-06, + "objective/entropy": 37.045387268066406, + "objective/kl": 9.54155158996582, + "objective/non_score_reward": -0.9541550874710083, + "objective/rlhf_reward": 0.5833795011043552, + "objective/scores": 1.1, + "policy/approxkl_avg": 118.93629455566406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7818096876144409, + "step": 248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984393119812012 + }, + { + "episode": 4000, + "epoch": 0.07189847934716181, + "loss/policy_avg": 0.11583375930786133, + "lr": 2.9522622699386505e-06, + "objective/entropy": -29.975513458251953, + "objective/kl": 11.691999435424805, + "objective/non_score_reward": -1.169199824333191, + "objective/rlhf_reward": -4.276799207925796, + "objective/scores": 0.1, + "policy/approxkl_avg": 95.38002014160156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6541903018951416, + "step": 249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976664781570435 + }, + { + "episode": 4016, + "epoch": 0.07218607326455045, + "loss/policy_avg": 0.424482524394989, + "lr": 2.9520705521472393e-06, + "objective/entropy": 108.46614074707031, + "objective/kl": 7.699254989624023, + "objective/non_score_reward": -0.7699254751205444, + "objective/rlhf_reward": -2.679701870679855, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.84117126464844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9610189199447632, + "step": 250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005178451538086 + }, + { + "episode": 4032, + "epoch": 0.07247366718193911, + "loss/policy_avg": -0.013865754008293152, + "lr": 2.9518788343558285e-06, + "objective/entropy": 2.4157180786132812, + "objective/kl": 8.162057876586914, + "objective/non_score_reward": -0.8162057399749756, + "objective/rlhf_reward": -5.264822959899902, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.96641540527344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6327460408210754, + "step": 251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980905055999756 + }, + { + "episode": 4048, + "epoch": 0.07276126109932775, + "loss/policy_avg": 0.12326370179653168, + "lr": 2.9516871165644173e-06, + "objective/entropy": -18.136714935302734, + "objective/kl": 15.872564315795898, + "objective/non_score_reward": -1.5872564315795898, + "objective/rlhf_reward": -8.34902572631836, + "objective/scores": -0.5, + "policy/approxkl_avg": 183.87185668945312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.614548921585083, + "step": 252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986623525619507 + }, + { + "episode": 4064, + "epoch": 0.0730488550167164, + "loss/policy_avg": 0.42759251594543457, + "lr": 2.951495398773006e-06, + "objective/entropy": 187.13995361328125, + "objective/kl": 15.658686637878418, + "objective/non_score_reward": -1.5658683776855469, + "objective/rlhf_reward": -5.863473868370056, + "objective/scores": 0.1, + "policy/approxkl_avg": 77.336669921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7136948704719543, + "step": 253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998879075050354 + }, + { + "episode": 4080, + "epoch": 0.07333644893410504, + "loss/policy_avg": 0.45976442098617554, + "lr": 2.9513036809815954e-06, + "objective/entropy": -140.05638122558594, + "objective/kl": 3.995250940322876, + "objective/non_score_reward": -0.39952513575553894, + "objective/rlhf_reward": 2.8018994905054573, + "objective/scores": 1.1, + "policy/approxkl_avg": 19.618144989013672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5429213047027588, + "step": 254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998197078704834 + }, + { + "episode": 4096, + "epoch": 0.0736240428514937, + "loss/policy_avg": -0.02419188618659973, + "lr": 2.951111963190184e-06, + "objective/entropy": -150.40243530273438, + "objective/kl": 5.681851387023926, + "objective/non_score_reward": -0.5681850910186768, + "objective/rlhf_reward": -4.272740364074707, + "objective/scores": -0.5, + "policy/approxkl_avg": 25.766756057739258, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49843645095825195, + "step": 255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990272521972656 + }, + { + "episode": 4112, + "epoch": 0.07391163676888234, + "loss/policy_avg": 0.09387945383787155, + "lr": 2.950920245398773e-06, + "objective/entropy": 21.77161407470703, + "objective/kl": 8.116277694702148, + "objective/non_score_reward": -0.8116278648376465, + "objective/rlhf_reward": -5.246511459350586, + "objective/scores": -0.5, + "policy/approxkl_avg": 51.24212646484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.709270715713501, + "step": 256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979963302612305 + }, + { + "episode": 4128, + "epoch": 0.07419923068627099, + "loss/policy_avg": -0.4069734215736389, + "lr": 2.950728527607362e-06, + "objective/entropy": -174.08395385742188, + "objective/kl": 8.566609382629395, + "objective/non_score_reward": -0.8566610217094421, + "objective/rlhf_reward": -1.7647844008809193, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 61.56124496459961, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4532226324081421, + "step": 257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002131938934326 + }, + { + "episode": 4144, + "epoch": 0.07448682460365963, + "loss/policy_avg": 0.04940726235508919, + "lr": 2.950536809815951e-06, + "objective/entropy": 32.93096923828125, + "objective/kl": 13.826756477355957, + "objective/non_score_reward": -1.3826756477355957, + "objective/rlhf_reward": -7.530702590942383, + "objective/scores": -0.5, + "policy/approxkl_avg": 152.21774291992188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4292905330657959, + "step": 258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970979690551758 + }, + { + "episode": 4160, + "epoch": 0.07477441852104828, + "loss/policy_avg": 0.08529473841190338, + "lr": 2.95034509202454e-06, + "objective/entropy": 14.531261444091797, + "objective/kl": 6.426525115966797, + "objective/non_score_reward": -0.6426525712013245, + "objective/rlhf_reward": 1.8293897002935413, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.471156597137451, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3390624523162842, + "step": 259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988377094268799 + }, + { + "episode": 4176, + "epoch": 0.07506201243843692, + "loss/policy_avg": 0.6565670371055603, + "lr": 2.9501533742331287e-06, + "objective/entropy": 13.341388702392578, + "objective/kl": 13.047385215759277, + "objective/non_score_reward": -1.3047385215759277, + "objective/rlhf_reward": -3.0962477944054942, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 156.30238342285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6474744081497192, + "step": 260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990012645721436 + }, + { + "episode": 4192, + "epoch": 0.07534960635582558, + "loss/policy_avg": 0.8194575309753418, + "lr": 2.949961656441718e-06, + "objective/entropy": 39.95905303955078, + "objective/kl": 11.301864624023438, + "objective/non_score_reward": -1.1301864385604858, + "objective/rlhf_reward": -6.520745754241943, + "objective/scores": -0.5, + "policy/approxkl_avg": 100.30358123779297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6240185499191284, + "step": 261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996020793914795 + }, + { + "episode": 4208, + "epoch": 0.07563720027321422, + "loss/policy_avg": 0.9927411079406738, + "lr": 2.9497699386503067e-06, + "objective/entropy": 214.742431640625, + "objective/kl": 10.751565933227539, + "objective/non_score_reward": -1.075156569480896, + "objective/rlhf_reward": -3.900626084208488, + "objective/scores": 0.1, + "policy/approxkl_avg": 190.6407928466797, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5556049346923828, + "step": 262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008435249328613 + }, + { + "episode": 4224, + "epoch": 0.07592479419060287, + "loss/policy_avg": 0.1370810866355896, + "lr": 2.949578220858896e-06, + "objective/entropy": -30.638282775878906, + "objective/kl": 4.496548652648926, + "objective/non_score_reward": -0.4496549069881439, + "objective/rlhf_reward": 0.14879166059023552, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.034202575683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6356778740882874, + "step": 263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985594749450684 + }, + { + "episode": 4240, + "epoch": 0.07621238810799151, + "loss/policy_avg": 0.6080716848373413, + "lr": 2.9493865030674847e-06, + "objective/entropy": -11.881404876708984, + "objective/kl": 10.906417846679688, + "objective/non_score_reward": -1.0906418561935425, + "objective/rlhf_reward": -2.5377384528246627, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 79.76661682128906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5296034812927246, + "step": 264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986059665679932 + }, + { + "episode": 4256, + "epoch": 0.07649998202538016, + "loss/policy_avg": 0.21646641194820404, + "lr": 2.9491947852760736e-06, + "objective/entropy": 142.3988800048828, + "objective/kl": 6.696286201477051, + "objective/non_score_reward": -0.6696287393569946, + "objective/rlhf_reward": 1.7214852511882786, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.300674438476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6062541007995605, + "step": 265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001516342163086 + }, + { + "episode": 4272, + "epoch": 0.0767875759427688, + "loss/policy_avg": 0.13891346752643585, + "lr": 2.9490030674846628e-06, + "objective/entropy": -61.452545166015625, + "objective/kl": 5.754723072052002, + "objective/non_score_reward": -0.575472354888916, + "objective/rlhf_reward": 0.09811072945594779, + "objective/scores": 0.6, + "policy/approxkl_avg": 18.09847068786621, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.41696321964263916, + "step": 266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999955177307129 + }, + { + "episode": 4288, + "epoch": 0.07707516986015746, + "loss/policy_avg": 0.19311824440956116, + "lr": 2.9488113496932516e-06, + "objective/entropy": 32.384788513183594, + "objective/kl": 10.006759643554688, + "objective/non_score_reward": -1.000675916671753, + "objective/rlhf_reward": -3.6027039051055905, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.9388427734375, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.571370542049408, + "step": 267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998531460762024 + }, + { + "episode": 4304, + "epoch": 0.0773627637775461, + "loss/policy_avg": -0.14073559641838074, + "lr": 2.9486196319018404e-06, + "objective/entropy": 85.10265350341797, + "objective/kl": 15.816366195678711, + "objective/non_score_reward": -1.5816365480422974, + "objective/rlhf_reward": -5.9265462219715115, + "objective/scores": 0.1, + "policy/approxkl_avg": 122.70755004882812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.756123423576355, + "step": 268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997191429138184 + }, + { + "episode": 4320, + "epoch": 0.07765035769493475, + "loss/policy_avg": 0.17501243948936462, + "lr": 2.9484279141104296e-06, + "objective/entropy": 93.92215728759766, + "objective/kl": 9.422307968139648, + "objective/non_score_reward": -0.9422306418418884, + "objective/rlhf_reward": -3.368922537565231, + "objective/scores": 0.1, + "policy/approxkl_avg": 55.133872985839844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5802706480026245, + "step": 269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981038570404053 + }, + { + "episode": 4336, + "epoch": 0.0779379516123234, + "loss/policy_avg": 0.4402735233306885, + "lr": 2.9482361963190184e-06, + "objective/entropy": -10.538581848144531, + "objective/kl": 10.759380340576172, + "objective/non_score_reward": -1.075938105583191, + "objective/rlhf_reward": -2.6418928853875263, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 119.47884368896484, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5788609385490417, + "step": 270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987006187438965 + }, + { + "episode": 4352, + "epoch": 0.07822554552971205, + "loss/policy_avg": 0.582582414150238, + "lr": 2.9480444785276077e-06, + "objective/entropy": -98.20463562011719, + "objective/kl": 9.743408203125, + "objective/non_score_reward": -0.9743408560752869, + "objective/rlhf_reward": 0.5026364937424663, + "objective/scores": 1.1, + "policy/approxkl_avg": 72.30321502685547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6191748380661011, + "step": 271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998560905456543 + }, + { + "episode": 4368, + "epoch": 0.0785131394471007, + "loss/policy_avg": 0.6289035081863403, + "lr": 2.9478527607361965e-06, + "objective/entropy": -71.34103393554688, + "objective/kl": 12.412795066833496, + "objective/non_score_reward": -1.2412794828414917, + "objective/rlhf_reward": -4.565117752552032, + "objective/scores": 0.1, + "policy/approxkl_avg": 164.9723358154297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736472487449646, + "step": 272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970333576202393 + }, + { + "episode": 4384, + "epoch": 0.07880073336448934, + "loss/policy_avg": 0.21773582696914673, + "lr": 2.9476610429447853e-06, + "objective/entropy": -0.68145751953125, + "objective/kl": 13.457925796508789, + "objective/non_score_reward": -1.3457924127578735, + "objective/rlhf_reward": -4.98316973298788, + "objective/scores": 0.1, + "policy/approxkl_avg": 254.5444793701172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8520537614822388, + "step": 273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983816146850586 + }, + { + "episode": 4400, + "epoch": 0.079088327281878, + "loss/policy_avg": 0.7082804441452026, + "lr": 2.9474693251533745e-06, + "objective/entropy": -72.47603607177734, + "objective/kl": 11.576058387756348, + "objective/non_score_reward": -1.1576058864593506, + "objective/rlhf_reward": -6.630423545837402, + "objective/scores": -0.5, + "policy/approxkl_avg": 119.74085235595703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6347866058349609, + "step": 274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995064735412598 + }, + { + "episode": 4416, + "epoch": 0.07937592119926663, + "loss/policy_avg": -0.11256889998912811, + "lr": 2.9472776073619633e-06, + "objective/entropy": -83.20799255371094, + "objective/kl": 3.853982925415039, + "objective/non_score_reward": -0.385398268699646, + "objective/rlhf_reward": 2.8584069401025776, + "objective/scores": 1.1, + "policy/approxkl_avg": 0.9227147102355957, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.39738836884498596, + "step": 275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0011487007141113 + }, + { + "episode": 4432, + "epoch": 0.07966351511665529, + "loss/policy_avg": 0.32202059030532837, + "lr": 2.947085889570552e-06, + "objective/entropy": 71.21481323242188, + "objective/kl": 10.407328605651855, + "objective/non_score_reward": -1.0407328605651855, + "objective/rlhf_reward": -6.162931442260742, + "objective/scores": -0.5, + "policy/approxkl_avg": 108.96805572509766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5844370126724243, + "step": 276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993963241577148 + }, + { + "episode": 4448, + "epoch": 0.07995110903404393, + "loss/policy_avg": -0.023769661784172058, + "lr": 2.9468941717791414e-06, + "objective/entropy": -13.402759552001953, + "objective/kl": 12.098983764648438, + "objective/non_score_reward": -1.2098984718322754, + "objective/rlhf_reward": -6.839593887329102, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.1106595993042, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7059996724128723, + "step": 277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998667240142822 + }, + { + "episode": 4464, + "epoch": 0.08023870295143258, + "loss/policy_avg": 0.3893451690673828, + "lr": 2.9467024539877298e-06, + "objective/entropy": 189.41259765625, + "objective/kl": 6.5235137939453125, + "objective/non_score_reward": -0.6523513793945312, + "objective/rlhf_reward": -2.20940545797348, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.074134826660156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.798334002494812, + "step": 278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013234615325928 + }, + { + "episode": 4480, + "epoch": 0.08052629686882122, + "loss/policy_avg": 0.0914541631937027, + "lr": 2.946510736196319e-06, + "objective/entropy": 84.65312194824219, + "objective/kl": 10.466255187988281, + "objective/non_score_reward": -1.0466254949569702, + "objective/rlhf_reward": 0.21349813938140905, + "objective/scores": 1.1, + "policy/approxkl_avg": 125.91380310058594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7349311113357544, + "step": 279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984486103057861 + }, + { + "episode": 4496, + "epoch": 0.08081389078620987, + "loss/policy_avg": 0.26043936610221863, + "lr": 2.946319018404908e-06, + "objective/entropy": -58.848392486572266, + "objective/kl": 10.368853569030762, + "objective/non_score_reward": -1.0368852615356445, + "objective/rlhf_reward": -1.7475412249565123, + "objective/scores": 0.6, + "policy/approxkl_avg": 51.58127975463867, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.612343430519104, + "step": 280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989829063415527 + }, + { + "episode": 4512, + "epoch": 0.08110148470359851, + "loss/policy_avg": 0.2319001704454422, + "lr": 2.946127300613497e-06, + "objective/entropy": -36.56258773803711, + "objective/kl": 9.824468612670898, + "objective/non_score_reward": -0.9824467897415161, + "objective/rlhf_reward": -5.9297871589660645, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.27013397216797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7777178883552551, + "step": 281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996426105499268 + }, + { + "episode": 4528, + "epoch": 0.08138907862098717, + "loss/policy_avg": 0.2995202839374542, + "lr": 2.945935582822086e-06, + "objective/entropy": 48.848323822021484, + "objective/kl": 16.30365753173828, + "objective/non_score_reward": -1.6303660869598389, + "objective/rlhf_reward": -6.1214640274643894, + "objective/scores": 0.1, + "policy/approxkl_avg": 165.35614013671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6986551284790039, + "step": 282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979193210601807 + }, + { + "episode": 4544, + "epoch": 0.08167667253837581, + "loss/policy_avg": 0.47233566641807556, + "lr": 2.9457438650306747e-06, + "objective/entropy": 59.0998420715332, + "objective/kl": 12.852258682250977, + "objective/non_score_reward": -1.2852261066436768, + "objective/rlhf_reward": -7.140904426574707, + "objective/scores": -0.5, + "policy/approxkl_avg": 134.68528747558594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.42645326256752014, + "step": 283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972941875457764 + }, + { + "episode": 4560, + "epoch": 0.08196426645576446, + "loss/policy_avg": 0.36607012152671814, + "lr": 2.945552147239264e-06, + "objective/entropy": 53.86030578613281, + "objective/kl": 5.768060684204102, + "objective/non_score_reward": -0.5768060684204102, + "objective/rlhf_reward": 2.0927755475044254, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.973777770996094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7312701940536499, + "step": 284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9965434074401855 + }, + { + "episode": 4576, + "epoch": 0.0822518603731531, + "loss/policy_avg": -0.048812031745910645, + "lr": 2.9453604294478527e-06, + "objective/entropy": 65.61720275878906, + "objective/kl": 8.64478588104248, + "objective/non_score_reward": -0.864478588104248, + "objective/rlhf_reward": 0.9420856922864917, + "objective/scores": 1.1, + "policy/approxkl_avg": 79.18849182128906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6703629493713379, + "step": 285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975764751434326 + }, + { + "episode": 4592, + "epoch": 0.08253945429054176, + "loss/policy_avg": 0.12739452719688416, + "lr": 2.945168711656442e-06, + "objective/entropy": -44.89176940917969, + "objective/kl": 4.6148176193237305, + "objective/non_score_reward": -0.4614817500114441, + "objective/rlhf_reward": 2.554072973877192, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.531970024108887, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6312613487243652, + "step": 286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990942478179932 + }, + { + "episode": 4608, + "epoch": 0.0828270482079304, + "loss/policy_avg": 0.13534197211265564, + "lr": 2.9449769938650308e-06, + "objective/entropy": 234.43975830078125, + "objective/kl": 9.361823081970215, + "objective/non_score_reward": -0.9361822605133057, + "objective/rlhf_reward": -3.3447290569543835, + "objective/scores": 0.1, + "policy/approxkl_avg": 136.4779815673828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5726273059844971, + "step": 287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99815034866333 + }, + { + "episode": 4624, + "epoch": 0.08311464212531905, + "loss/policy_avg": 0.42366448044776917, + "lr": 2.9447852760736196e-06, + "objective/entropy": -113.96307373046875, + "objective/kl": 8.458015441894531, + "objective/non_score_reward": -0.845801591873169, + "objective/rlhf_reward": -5.383206367492676, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.63624572753906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7157449126243591, + "step": 288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999110221862793 + }, + { + "episode": 4640, + "epoch": 0.0834022360427077, + "loss/policy_avg": 0.5396846532821655, + "lr": 2.944593558282209e-06, + "objective/entropy": 132.98178100585938, + "objective/kl": 13.068355560302734, + "objective/non_score_reward": -1.3068355321884155, + "objective/rlhf_reward": -2.8273421287536618, + "objective/scores": 0.6, + "policy/approxkl_avg": 203.39913940429688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8317103385925293, + "step": 289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000296115875244 + }, + { + "episode": 4656, + "epoch": 0.08368982996009634, + "loss/policy_avg": 0.19873766601085663, + "lr": 2.9444018404907976e-06, + "objective/entropy": -150.62936401367188, + "objective/kl": 12.215543746948242, + "objective/non_score_reward": -1.2215545177459717, + "objective/rlhf_reward": -3.282097909514027, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 94.99072265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8213875889778137, + "step": 290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985655546188354 + }, + { + "episode": 4672, + "epoch": 0.083977423877485, + "loss/policy_avg": 0.02723608911037445, + "lr": 2.9442101226993864e-06, + "objective/entropy": 122.41561889648438, + "objective/kl": 8.34988784790039, + "objective/non_score_reward": -0.8349887132644653, + "objective/rlhf_reward": -5.3399553298950195, + "objective/scores": -0.5, + "policy/approxkl_avg": 45.76279067993164, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.34027427434921265, + "step": 291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0011491775512695 + }, + { + "episode": 4688, + "epoch": 0.08426501779487364, + "loss/policy_avg": 0.32025736570358276, + "lr": 2.9440184049079756e-06, + "objective/entropy": -45.17048645019531, + "objective/kl": 4.012126445770264, + "objective/non_score_reward": -0.4012127220630646, + "objective/rlhf_reward": 2.795149059593678, + "objective/scores": 1.1, + "policy/approxkl_avg": 35.060951232910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4729318916797638, + "step": 292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977355003356934 + }, + { + "episode": 4704, + "epoch": 0.08455261171226229, + "loss/policy_avg": 0.22684016823768616, + "lr": 2.9438266871165645e-06, + "objective/entropy": 16.97724151611328, + "objective/kl": 14.390579223632812, + "objective/non_score_reward": -1.4390579462051392, + "objective/rlhf_reward": -3.808820496277745, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 180.50299072265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6010082960128784, + "step": 293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9963459968566895 + }, + { + "episode": 4720, + "epoch": 0.08484020562965093, + "loss/policy_avg": 0.44294729828834534, + "lr": 2.9436349693251537e-06, + "objective/entropy": -67.02993774414062, + "objective/kl": 11.463648796081543, + "objective/non_score_reward": -1.1463651657104492, + "objective/rlhf_reward": -0.1854602456092831, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.088623046875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5088694095611572, + "step": 294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967097043991089 + }, + { + "episode": 4736, + "epoch": 0.08512779954703958, + "loss/policy_avg": 0.12860551476478577, + "lr": 2.9434432515337425e-06, + "objective/entropy": 73.75012969970703, + "objective/kl": 11.233770370483398, + "objective/non_score_reward": -1.1233770847320557, + "objective/rlhf_reward": -4.093508290499448, + "objective/scores": 0.1, + "policy/approxkl_avg": 110.15925598144531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49316030740737915, + "step": 295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9951969385147095 + }, + { + "episode": 4752, + "epoch": 0.08541539346442822, + "loss/policy_avg": 0.4201592803001404, + "lr": 2.9432515337423313e-06, + "objective/entropy": 43.006744384765625, + "objective/kl": 12.237357139587402, + "objective/non_score_reward": -1.2237358093261719, + "objective/rlhf_reward": -6.8949432373046875, + "objective/scores": -0.5, + "policy/approxkl_avg": 152.20016479492188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7024413347244263, + "step": 296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9959149360656738 + }, + { + "episode": 4768, + "epoch": 0.08570298738181688, + "loss/policy_avg": 0.17788049578666687, + "lr": 2.9430598159509205e-06, + "objective/entropy": -236.15725708007812, + "objective/kl": 9.120914459228516, + "objective/non_score_reward": -0.9120914936065674, + "objective/rlhf_reward": 0.751633965969086, + "objective/scores": 1.1, + "policy/approxkl_avg": 85.18730163574219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6455779671669006, + "step": 297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998746395111084 + }, + { + "episode": 4784, + "epoch": 0.08599058129920552, + "loss/policy_avg": -0.09744630753993988, + "lr": 2.9428680981595093e-06, + "objective/entropy": 28.533233642578125, + "objective/kl": 6.665700912475586, + "objective/non_score_reward": -0.6665701270103455, + "objective/rlhf_reward": -4.666280269622803, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.560096740722656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8575639724731445, + "step": 298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0005359649658203 + }, + { + "episode": 4800, + "epoch": 0.08627817521659417, + "loss/policy_avg": 0.11877211183309555, + "lr": 2.9426763803680986e-06, + "objective/entropy": 163.72457885742188, + "objective/kl": 7.44589900970459, + "objective/non_score_reward": -0.7445899248123169, + "objective/rlhf_reward": 1.4216401070356373, + "objective/scores": 1.1, + "policy/approxkl_avg": 10.336346626281738, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6966899633407593, + "step": 299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998868703842163 + }, + { + "episode": 4816, + "epoch": 0.08656576913398281, + "loss/policy_avg": 0.3418015241622925, + "lr": 2.9424846625766874e-06, + "objective/entropy": -112.08236694335938, + "objective/kl": 12.987334251403809, + "objective/non_score_reward": -1.2987333536148071, + "objective/rlhf_reward": -0.7949335634708401, + "objective/scores": 1.1, + "policy/approxkl_avg": 233.1175994873047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6791957020759583, + "step": 300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992109537124634 + }, + { + "episode": 4832, + "epoch": 0.08685336305137147, + "loss/policy_avg": 0.17691361904144287, + "lr": 2.942292944785276e-06, + "objective/entropy": 261.80804443359375, + "objective/kl": 16.804275512695312, + "objective/non_score_reward": -1.6804277896881104, + "objective/rlhf_reward": -6.321710979938507, + "objective/scores": 0.1, + "policy/approxkl_avg": 268.57550048828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6984161734580994, + "step": 301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9969482421875 + }, + { + "episode": 4848, + "epoch": 0.0871409569687601, + "loss/policy_avg": 0.16571223735809326, + "lr": 2.942101226993865e-06, + "objective/entropy": 186.30453491210938, + "objective/kl": 7.996967315673828, + "objective/non_score_reward": -0.799696683883667, + "objective/rlhf_reward": -5.198786735534668, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.46959686279297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5912961959838867, + "step": 302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007529258728027 + }, + { + "episode": 4864, + "epoch": 0.08742855088614876, + "loss/policy_avg": 0.19111140072345734, + "lr": 2.941909509202454e-06, + "objective/entropy": -6.660182952880859, + "objective/kl": 9.802804946899414, + "objective/non_score_reward": -0.9802805781364441, + "objective/rlhf_reward": -5.9211225509643555, + "objective/scores": -0.5, + "policy/approxkl_avg": 100.47252655029297, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6817684173583984, + "step": 303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000228881835938 + }, + { + "episode": 4880, + "epoch": 0.0877161448035374, + "loss/policy_avg": 0.050571128726005554, + "lr": 2.941717791411043e-06, + "objective/entropy": 106.45433807373047, + "objective/kl": 15.282678604125977, + "objective/non_score_reward": -1.528267741203308, + "objective/rlhf_reward": -8.11307144165039, + "objective/scores": -0.5, + "policy/approxkl_avg": 272.7275085449219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5246777534484863, + "step": 304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000553131103516 + }, + { + "episode": 4896, + "epoch": 0.08800373872092605, + "loss/policy_avg": 0.5568979978561401, + "lr": 2.941526073619632e-06, + "objective/entropy": 16.014564514160156, + "objective/kl": 9.838717460632324, + "objective/non_score_reward": -0.9838719367980957, + "objective/rlhf_reward": -5.935487747192383, + "objective/scores": -0.5, + "policy/approxkl_avg": 102.69760131835938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5866303443908691, + "step": 305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998826026916504 + }, + { + "episode": 4912, + "epoch": 0.08829133263831469, + "loss/policy_avg": 0.19343560934066772, + "lr": 2.9413343558282207e-06, + "objective/entropy": -7.7786407470703125, + "objective/kl": 13.301519393920898, + "objective/non_score_reward": -1.3301520347595215, + "objective/rlhf_reward": -7.320608139038086, + "objective/scores": -0.5, + "policy/approxkl_avg": 99.96537780761719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6204877495765686, + "step": 306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997145414352417 + }, + { + "episode": 4928, + "epoch": 0.08857892655570335, + "loss/policy_avg": 0.4353540539741516, + "lr": 2.94114263803681e-06, + "objective/entropy": 180.70339965820312, + "objective/kl": 15.014328002929688, + "objective/non_score_reward": -1.5014326572418213, + "objective/rlhf_reward": -4.343870957911598, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 205.107666015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5958458185195923, + "step": 307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999456763267517 + }, + { + "episode": 4944, + "epoch": 0.088866520473092, + "loss/policy_avg": 0.044183149933815, + "lr": 2.9409509202453987e-06, + "objective/entropy": 147.0219268798828, + "objective/kl": 7.249485969543457, + "objective/non_score_reward": -0.7249486446380615, + "objective/rlhf_reward": -2.49979438483715, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.019502639770508, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8432949185371399, + "step": 308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000023126602173 + }, + { + "episode": 4960, + "epoch": 0.08915411439048064, + "loss/policy_avg": -0.1347326934337616, + "lr": 2.940759202453988e-06, + "objective/entropy": -102.82943725585938, + "objective/kl": 8.855981826782227, + "objective/non_score_reward": -0.8855981826782227, + "objective/rlhf_reward": -5.542392730712891, + "objective/scores": -0.5, + "policy/approxkl_avg": 78.90658569335938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6382081508636475, + "step": 309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998550415039062 + }, + { + "episode": 4976, + "epoch": 0.0894417083078693, + "loss/policy_avg": 0.2769806981086731, + "lr": 2.9405674846625768e-06, + "objective/entropy": 222.6593017578125, + "objective/kl": 12.968841552734375, + "objective/non_score_reward": -1.2968841791152954, + "objective/rlhf_reward": -0.7875365525484082, + "objective/scores": 1.1, + "policy/approxkl_avg": 143.65463256835938, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.644939661026001, + "step": 310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0016446113586426 + }, + { + "episode": 4992, + "epoch": 0.08972930222525793, + "loss/policy_avg": -0.31937313079833984, + "lr": 2.9403757668711656e-06, + "objective/entropy": 224.8351593017578, + "objective/kl": 11.696734428405762, + "objective/non_score_reward": -1.1696734428405762, + "objective/rlhf_reward": -2.5559872857489925, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 78.87408447265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.729614794254303, + "step": 311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000832080841064 + }, + { + "episode": 5008, + "epoch": 0.09001689614264659, + "loss/policy_avg": 0.6097627878189087, + "lr": 2.940184049079755e-06, + "objective/entropy": 106.34514617919922, + "objective/kl": 8.760353088378906, + "objective/non_score_reward": -0.8760353326797485, + "objective/rlhf_reward": -5.504140853881836, + "objective/scores": -0.5, + "policy/approxkl_avg": 121.40504455566406, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6102486848831177, + "step": 312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006418228149414 + }, + { + "episode": 5024, + "epoch": 0.09030449006003523, + "loss/policy_avg": 0.2938482463359833, + "lr": 2.9399923312883436e-06, + "objective/entropy": -114.29545593261719, + "objective/kl": 8.454465866088867, + "objective/non_score_reward": -0.8454465866088867, + "objective/rlhf_reward": -2.9817864209413525, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.16905212402344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6143299341201782, + "step": 313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9990007877349854 + }, + { + "episode": 5040, + "epoch": 0.09059208397742388, + "loss/policy_avg": 0.1566888689994812, + "lr": 2.939800613496933e-06, + "objective/entropy": 114.07573699951172, + "objective/kl": 5.000811576843262, + "objective/non_score_reward": -0.5000811219215393, + "objective/rlhf_reward": -4.000324726104736, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.11898422241211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5816758871078491, + "step": 314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986605644226074 + }, + { + "episode": 5056, + "epoch": 0.09087967789481252, + "loss/policy_avg": 0.08153313398361206, + "lr": 2.9396088957055217e-06, + "objective/entropy": -2.472991943359375, + "objective/kl": 15.302618026733398, + "objective/non_score_reward": -1.5302616357803345, + "objective/rlhf_reward": -4.5169265604654125, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 221.7631072998047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7500788569450378, + "step": 315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996685266494751 + }, + { + "episode": 5072, + "epoch": 0.09116727181220118, + "loss/policy_avg": -0.22540059685707092, + "lr": 2.9394171779141105e-06, + "objective/entropy": 194.83139038085938, + "objective/kl": 6.124555587768555, + "objective/non_score_reward": -0.6124556064605713, + "objective/rlhf_reward": -4.449822425842285, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.98652648925781, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4853079319000244, + "step": 316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0110392570495605 + }, + { + "episode": 5088, + "epoch": 0.09145486572958982, + "loss/policy_avg": 0.33575987815856934, + "lr": 2.9392254601226997e-06, + "objective/entropy": -246.2069854736328, + "objective/kl": 6.744620323181152, + "objective/non_score_reward": -0.674461841583252, + "objective/rlhf_reward": -0.7504362864064533, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 44.58488464355469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5204676389694214, + "step": 317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0011723041534424 + }, + { + "episode": 5104, + "epoch": 0.09174245964697847, + "loss/policy_avg": -0.03650900349020958, + "lr": 2.9390337423312885e-06, + "objective/entropy": -38.37519454956055, + "objective/kl": 12.865215301513672, + "objective/non_score_reward": -1.2865217924118042, + "objective/rlhf_reward": -7.146087169647217, + "objective/scores": -0.5, + "policy/approxkl_avg": 179.89398193359375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7440187931060791, + "step": 318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0004358291625977 + }, + { + "episode": 5120, + "epoch": 0.09203005356436711, + "loss/policy_avg": 0.39611124992370605, + "lr": 2.9388420245398773e-06, + "objective/entropy": -115.19349670410156, + "objective/kl": 7.894246578216553, + "objective/non_score_reward": -0.7894245982170105, + "objective/rlhf_reward": -5.157698631286621, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.43158721923828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6976910829544067, + "step": 319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998779296875 + }, + { + "episode": 5136, + "epoch": 0.09231764748175576, + "loss/policy_avg": 0.6114602088928223, + "lr": 2.9386503067484665e-06, + "objective/entropy": 105.1552734375, + "objective/kl": 13.109886169433594, + "objective/non_score_reward": -1.3109886646270752, + "objective/rlhf_reward": -0.8439547479152676, + "objective/scores": 1.1, + "policy/approxkl_avg": 112.3132095336914, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5804147720336914, + "step": 320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989616870880127 + }, + { + "episode": 5152, + "epoch": 0.0926052413991444, + "loss/policy_avg": 0.0743880569934845, + "lr": 2.9384585889570554e-06, + "objective/entropy": 77.44183349609375, + "objective/kl": 8.181852340698242, + "objective/non_score_reward": -0.8181852698326111, + "objective/rlhf_reward": -1.1500348619380332, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 82.51998901367188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5503402948379517, + "step": 321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0014610290527344 + }, + { + "episode": 5168, + "epoch": 0.09289283531653306, + "loss/policy_avg": 0.2947062849998474, + "lr": 2.9382668711656446e-06, + "objective/entropy": 6.5170745849609375, + "objective/kl": 14.03689956665039, + "objective/non_score_reward": -1.4036900997161865, + "objective/rlhf_reward": -1.214760041236877, + "objective/scores": 1.1, + "policy/approxkl_avg": 161.59352111816406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5341845750808716, + "step": 322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988269805908203 + }, + { + "episode": 5184, + "epoch": 0.0931804292339217, + "loss/policy_avg": 0.2814280092716217, + "lr": 2.938075153374233e-06, + "objective/entropy": -135.19436645507812, + "objective/kl": 8.1387357711792, + "objective/non_score_reward": -0.8138736486434937, + "objective/rlhf_reward": -2.8554944455623623, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.818918228149414, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.581717312335968, + "step": 323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997791051864624 + }, + { + "episode": 5200, + "epoch": 0.09346802315131035, + "loss/policy_avg": 0.36961764097213745, + "lr": 2.937883435582822e-06, + "objective/entropy": -77.63428497314453, + "objective/kl": 9.158490180969238, + "objective/non_score_reward": -0.915848970413208, + "objective/rlhf_reward": -5.663395881652832, + "objective/scores": -0.5, + "policy/approxkl_avg": 134.7732696533203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5512831807136536, + "step": 324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995112419128418 + }, + { + "episode": 5216, + "epoch": 0.09375561706869899, + "loss/policy_avg": 0.11706581711769104, + "lr": 2.937691717791411e-06, + "objective/entropy": -31.434471130371094, + "objective/kl": 15.156240463256836, + "objective/non_score_reward": -1.5156242847442627, + "objective/rlhf_reward": -8.06249713897705, + "objective/scores": -0.5, + "policy/approxkl_avg": 217.86083984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.458157479763031, + "step": 325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997313380241394 + }, + { + "episode": 5232, + "epoch": 0.09404321098608764, + "loss/policy_avg": 0.08816379308700562, + "lr": 2.9375e-06, + "objective/entropy": 182.1945343017578, + "objective/kl": 14.343957901000977, + "objective/non_score_reward": -1.4343959093093872, + "objective/rlhf_reward": -2.813864682556364, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 90.6133804321289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.763087272644043, + "step": 326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991672039031982 + }, + { + "episode": 5248, + "epoch": 0.0943308049034763, + "loss/policy_avg": 0.4887702167034149, + "lr": 2.937308282208589e-06, + "objective/entropy": 276.0743103027344, + "objective/kl": 19.531585693359375, + "objective/non_score_reward": -1.9531588554382324, + "objective/rlhf_reward": -7.412635026872159, + "objective/scores": 0.1, + "policy/approxkl_avg": 292.0692138671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6329343318939209, + "step": 327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973821640014648 + }, + { + "episode": 5264, + "epoch": 0.09461839882086494, + "loss/policy_avg": 0.19927456974983215, + "lr": 2.937116564417178e-06, + "objective/entropy": -68.87179565429688, + "objective/kl": 9.256897926330566, + "objective/non_score_reward": -0.9256898164749146, + "objective/rlhf_reward": -5.702759265899658, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.61810302734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.825851321220398, + "step": 328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997621774673462 + }, + { + "episode": 5280, + "epoch": 0.09490599273825359, + "loss/policy_avg": 0.05179551616311073, + "lr": 2.9369248466257667e-06, + "objective/entropy": 154.8633575439453, + "objective/kl": 11.498334884643555, + "objective/non_score_reward": -1.1498336791992188, + "objective/rlhf_reward": -1.6756155534994333, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 41.51012420654297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6221466064453125, + "step": 329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999433994293213 + }, + { + "episode": 5296, + "epoch": 0.09519358665564223, + "loss/policy_avg": 0.7231928110122681, + "lr": 2.936733128834356e-06, + "objective/entropy": 186.2045135498047, + "objective/kl": 10.730362892150879, + "objective/non_score_reward": -1.0730363130569458, + "objective/rlhf_reward": -3.8921451330184933, + "objective/scores": 0.1, + "policy/approxkl_avg": 121.8580322265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7908709645271301, + "step": 330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002248287200928 + }, + { + "episode": 5312, + "epoch": 0.09548118057303089, + "loss/policy_avg": 0.2208048403263092, + "lr": 2.9365414110429447e-06, + "objective/entropy": -59.700164794921875, + "objective/kl": 16.838346481323242, + "objective/non_score_reward": -1.6838349103927612, + "objective/rlhf_reward": -8.735339164733887, + "objective/scores": -0.5, + "policy/approxkl_avg": 154.73214721679688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7444063425064087, + "step": 331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9977869987487793 + }, + { + "episode": 5328, + "epoch": 0.09576877449041953, + "loss/policy_avg": 0.06984035670757294, + "lr": 2.936349693251534e-06, + "objective/entropy": 54.201515197753906, + "objective/kl": 8.278887748718262, + "objective/non_score_reward": -0.8278888463973999, + "objective/rlhf_reward": -5.311555862426758, + "objective/scores": -0.5, + "policy/approxkl_avg": 91.28977966308594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7553679943084717, + "step": 332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997904300689697 + }, + { + "episode": 5344, + "epoch": 0.09605636840780818, + "loss/policy_avg": 0.3257616460323334, + "lr": 2.9361579754601228e-06, + "objective/entropy": -28.56784439086914, + "objective/kl": 12.646832466125488, + "objective/non_score_reward": -1.2646832466125488, + "objective/rlhf_reward": -7.058732986450195, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.71322631835938, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5907671451568604, + "step": 333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982057809829712 + }, + { + "episode": 5360, + "epoch": 0.09634396232519682, + "loss/policy_avg": 0.4729722738265991, + "lr": 2.9359662576687116e-06, + "objective/entropy": -150.07943725585938, + "objective/kl": 11.293041229248047, + "objective/non_score_reward": -1.1293039321899414, + "objective/rlhf_reward": -4.1172159075737, + "objective/scores": 0.1, + "policy/approxkl_avg": 144.25387573242188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6657143831253052, + "step": 334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998594045639038 + }, + { + "episode": 5376, + "epoch": 0.09663155624258547, + "loss/policy_avg": 0.2886536419391632, + "lr": 2.935774539877301e-06, + "objective/entropy": -131.1785125732422, + "objective/kl": 10.785483360290527, + "objective/non_score_reward": -1.0785483121871948, + "objective/rlhf_reward": -3.9141932338476177, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.560306549072266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8351963758468628, + "step": 335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998948335647583 + }, + { + "episode": 5392, + "epoch": 0.09691915015997411, + "loss/policy_avg": 1.0824708938598633, + "lr": 2.9355828220858896e-06, + "objective/entropy": -5.367637634277344, + "objective/kl": 14.300538063049316, + "objective/non_score_reward": -1.430053949356079, + "objective/rlhf_reward": -2.7964971407663555, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 151.68798828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5800391435623169, + "step": 336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980026483535767 + }, + { + "episode": 5408, + "epoch": 0.09720674407736277, + "loss/policy_avg": 0.3940733075141907, + "lr": 2.935391104294479e-06, + "objective/entropy": -165.78109741210938, + "objective/kl": 15.360054016113281, + "objective/non_score_reward": -1.5360053777694702, + "objective/rlhf_reward": -1.7440215110778805, + "objective/scores": 1.1, + "policy/approxkl_avg": 114.84159088134766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6686538457870483, + "step": 337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000908374786377 + }, + { + "episode": 5424, + "epoch": 0.0974943379947514, + "loss/policy_avg": 0.07578772306442261, + "lr": 2.9351993865030677e-06, + "objective/entropy": 17.821250915527344, + "objective/kl": 17.395343780517578, + "objective/non_score_reward": -1.739534616470337, + "objective/rlhf_reward": -4.83543178655294, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 294.8856506347656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6329281330108643, + "step": 338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982552528381348 + }, + { + "episode": 5440, + "epoch": 0.09778193191214006, + "loss/policy_avg": -0.0766182690858841, + "lr": 2.9350076687116565e-06, + "objective/entropy": -72.60086059570312, + "objective/kl": 9.59086799621582, + "objective/non_score_reward": -0.9590868949890137, + "objective/rlhf_reward": -5.836347579956055, + "objective/scores": -0.5, + "policy/approxkl_avg": 61.654563903808594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8256030678749084, + "step": 339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002091407775879 + }, + { + "episode": 5456, + "epoch": 0.0980695258295287, + "loss/policy_avg": 0.039667725563049316, + "lr": 2.9348159509202457e-06, + "objective/entropy": 194.9036865234375, + "objective/kl": 9.621345520019531, + "objective/non_score_reward": -0.9621344804763794, + "objective/rlhf_reward": -3.4485378623008724, + "objective/scores": 0.1, + "policy/approxkl_avg": 69.39463806152344, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.49096935987472534, + "step": 340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0042881965637207 + }, + { + "episode": 5472, + "epoch": 0.09835711974691735, + "loss/policy_avg": 0.37447261810302734, + "lr": 2.9346242331288345e-06, + "objective/entropy": 5.140836715698242, + "objective/kl": 12.80933666229248, + "objective/non_score_reward": -1.2809334993362427, + "objective/rlhf_reward": -3.001028003469978, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 136.77401733398438, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6044674515724182, + "step": 341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971853494644165 + }, + { + "episode": 5488, + "epoch": 0.098644713664306, + "loss/policy_avg": 0.07032056152820587, + "lr": 2.9344325153374233e-06, + "objective/entropy": 34.30079650878906, + "objective/kl": 11.535228729248047, + "objective/non_score_reward": -1.1535229682922363, + "objective/rlhf_reward": -2.4913854024567943, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 80.33157348632812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5720343589782715, + "step": 342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973433017730713 + }, + { + "episode": 5504, + "epoch": 0.09893230758169465, + "loss/policy_avg": 0.039407968521118164, + "lr": 2.9342407975460126e-06, + "objective/entropy": 29.278091430664062, + "objective/kl": 1.9333374500274658, + "objective/non_score_reward": -0.19333375990390778, + "objective/rlhf_reward": 1.0514937088164578, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 2.3760266304016113, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8676018714904785, + "step": 343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0017752647399902 + }, + { + "episode": 5520, + "epoch": 0.09921990149908329, + "loss/policy_avg": 0.3448472023010254, + "lr": 2.9340490797546014e-06, + "objective/entropy": -73.16712951660156, + "objective/kl": 12.475850105285645, + "objective/non_score_reward": -1.2475850582122803, + "objective/rlhf_reward": -2.066621486784193, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 89.76551818847656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4174689054489136, + "step": 344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997223973274231 + }, + { + "episode": 5536, + "epoch": 0.09950749541647194, + "loss/policy_avg": 0.019372761249542236, + "lr": 2.93385736196319e-06, + "objective/entropy": 100.60924530029297, + "objective/kl": 12.531920433044434, + "objective/non_score_reward": -1.2531919479370117, + "objective/rlhf_reward": -4.612768149375915, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.791622161865234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.47664302587509155, + "step": 345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009078979492188 + }, + { + "episode": 5552, + "epoch": 0.0997950893338606, + "loss/policy_avg": 0.6332914233207703, + "lr": 2.933665644171779e-06, + "objective/entropy": 134.54344177246094, + "objective/kl": 11.1735200881958, + "objective/non_score_reward": -1.11735200881958, + "objective/rlhf_reward": -4.0694083034992214, + "objective/scores": 0.1, + "policy/approxkl_avg": 68.55665588378906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.561040997505188, + "step": 346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998960256576538 + }, + { + "episode": 5568, + "epoch": 0.10008268325124924, + "loss/policy_avg": 0.11013670265674591, + "lr": 2.9334739263803682e-06, + "objective/entropy": -63.645904541015625, + "objective/kl": 14.62928581237793, + "objective/non_score_reward": -1.4629285335540771, + "objective/rlhf_reward": -7.851714134216309, + "objective/scores": -0.5, + "policy/approxkl_avg": 86.95831298828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7983701229095459, + "step": 347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993393421173096 + }, + { + "episode": 5584, + "epoch": 0.10037027716863789, + "loss/policy_avg": 0.5474449396133423, + "lr": 2.933282208588957e-06, + "objective/entropy": -133.3090362548828, + "objective/kl": 13.566909790039062, + "objective/non_score_reward": -1.3566908836364746, + "objective/rlhf_reward": -1.0267638623714443, + "objective/scores": 1.1, + "policy/approxkl_avg": 108.0693359375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7627835273742676, + "step": 348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999748706817627 + }, + { + "episode": 5600, + "epoch": 0.10065787108602653, + "loss/policy_avg": 0.07263286411762238, + "lr": 2.933090490797546e-06, + "objective/entropy": 126.58871459960938, + "objective/kl": 9.20844554901123, + "objective/non_score_reward": -0.920844554901123, + "objective/rlhf_reward": 0.7166217207908634, + "objective/scores": 1.1, + "policy/approxkl_avg": 49.46379089355469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9294000864028931, + "step": 349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979853630065918 + }, + { + "episode": 5616, + "epoch": 0.10094546500341518, + "loss/policy_avg": 0.1791333109140396, + "lr": 2.932898773006135e-06, + "objective/entropy": 0.43863677978515625, + "objective/kl": 8.826881408691406, + "objective/non_score_reward": -0.8826882243156433, + "objective/rlhf_reward": -1.130752792954445, + "objective/scores": 0.6, + "policy/approxkl_avg": 41.58755874633789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6624951362609863, + "step": 350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989933967590332 + }, + { + "episode": 5632, + "epoch": 0.10123305892080382, + "loss/policy_avg": 0.20636233687400818, + "lr": 2.932707055214724e-06, + "objective/entropy": 150.28713989257812, + "objective/kl": 8.030426025390625, + "objective/non_score_reward": -0.8030425906181335, + "objective/rlhf_reward": -5.212170124053955, + "objective/scores": -0.5, + "policy/approxkl_avg": 98.02102661132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7396011352539062, + "step": 351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004942417144775 + }, + { + "episode": 5648, + "epoch": 0.10152065283819248, + "loss/policy_avg": 0.07704152166843414, + "lr": 2.932515337423313e-06, + "objective/entropy": 49.47189712524414, + "objective/kl": 14.631547927856445, + "objective/non_score_reward": -1.4631547927856445, + "objective/rlhf_reward": -5.45261919721961, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.44937133789062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7191329002380371, + "step": 352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998352527618408 + }, + { + "episode": 5664, + "epoch": 0.10180824675558112, + "loss/policy_avg": 0.09739228338003159, + "lr": 2.932323619631902e-06, + "objective/entropy": -24.802486419677734, + "objective/kl": 10.526655197143555, + "objective/non_score_reward": -1.0526655912399292, + "objective/rlhf_reward": 0.1893375083804134, + "objective/scores": 1.1, + "policy/approxkl_avg": 132.25132751464844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6535590887069702, + "step": 353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997605562210083 + }, + { + "episode": 5680, + "epoch": 0.10209584067296977, + "loss/policy_avg": 0.13763202726840973, + "lr": 2.9321319018404907e-06, + "objective/entropy": -119.98158264160156, + "objective/kl": 5.913897514343262, + "objective/non_score_reward": -0.5913897752761841, + "objective/rlhf_reward": -1.9655589669942855, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.558517456054688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6357654333114624, + "step": 354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000248908996582 + }, + { + "episode": 5696, + "epoch": 0.10238343459035841, + "loss/policy_avg": 0.527988851070404, + "lr": 2.93194018404908e-06, + "objective/entropy": -29.399810791015625, + "objective/kl": 13.658191680908203, + "objective/non_score_reward": -1.3658192157745361, + "objective/rlhf_reward": -3.063276922702789, + "objective/scores": 0.6, + "policy/approxkl_avg": 92.77324676513672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.728462815284729, + "step": 355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9965999126434326 + }, + { + "episode": 5712, + "epoch": 0.10267102850774706, + "loss/policy_avg": 0.6842390298843384, + "lr": 2.9317484662576688e-06, + "objective/entropy": 109.09453582763672, + "objective/kl": 10.667269706726074, + "objective/non_score_reward": -1.0667269229888916, + "objective/rlhf_reward": -2.710648416486338, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 69.14006042480469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7146704196929932, + "step": 356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998762607574463 + }, + { + "episode": 5728, + "epoch": 0.1029586224251357, + "loss/policy_avg": 0.07212770730257034, + "lr": 2.9315567484662576e-06, + "objective/entropy": -49.94731903076172, + "objective/kl": 2.350062370300293, + "objective/non_score_reward": -0.23500625789165497, + "objective/rlhf_reward": -0.5400250017642976, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.055467128753662, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5184276103973389, + "step": 357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012760162353516 + }, + { + "episode": 5744, + "epoch": 0.10324621634252436, + "loss/policy_avg": -0.052633900195360184, + "lr": 2.931365030674847e-06, + "objective/entropy": -12.867652893066406, + "objective/kl": 7.817996025085449, + "objective/non_score_reward": -0.7817996740341187, + "objective/rlhf_reward": -5.127198219299316, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.15538787841797, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7452627420425415, + "step": 358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967327117919922 + }, + { + "episode": 5760, + "epoch": 0.103533810259913, + "loss/policy_avg": 0.3177230954170227, + "lr": 2.9311733128834356e-06, + "objective/entropy": 126.2174301147461, + "objective/kl": 15.039046287536621, + "objective/non_score_reward": -1.5039048194885254, + "objective/rlhf_reward": -8.015619277954102, + "objective/scores": -0.5, + "policy/approxkl_avg": 148.06988525390625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5116921067237854, + "step": 359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9969123601913452 + }, + { + "episode": 5776, + "epoch": 0.10382140417730165, + "loss/policy_avg": 0.015442397445440292, + "lr": 2.930981595092025e-06, + "objective/entropy": 113.17227935791016, + "objective/kl": 15.746637344360352, + "objective/non_score_reward": -1.574663758277893, + "objective/rlhf_reward": -5.898655241727829, + "objective/scores": 0.1, + "policy/approxkl_avg": 97.0933837890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6202758550643921, + "step": 360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986428022384644 + }, + { + "episode": 5792, + "epoch": 0.10410899809469029, + "loss/policy_avg": 0.2362920194864273, + "lr": 2.9307898773006137e-06, + "objective/entropy": -133.8544464111328, + "objective/kl": 9.721014022827148, + "objective/non_score_reward": -0.9721014499664307, + "objective/rlhf_reward": 0.5115940213203434, + "objective/scores": 1.1, + "policy/approxkl_avg": 94.33212280273438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6412885189056396, + "step": 361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975833892822266 + }, + { + "episode": 5808, + "epoch": 0.10439659201207895, + "loss/policy_avg": 0.16871516406536102, + "lr": 2.9305981595092025e-06, + "objective/entropy": -65.79563903808594, + "objective/kl": 13.480731964111328, + "objective/non_score_reward": -1.3480732440948486, + "objective/rlhf_reward": -2.468574051500532, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 170.740234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7938269972801208, + "step": 362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986903667449951 + }, + { + "episode": 5824, + "epoch": 0.10468418592946759, + "loss/policy_avg": 0.2472498118877411, + "lr": 2.9304064417177917e-06, + "objective/entropy": 209.15557861328125, + "objective/kl": 8.526037216186523, + "objective/non_score_reward": -0.8526037335395813, + "objective/rlhf_reward": 0.989584976434708, + "objective/scores": 1.1, + "policy/approxkl_avg": 67.1485366821289, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9279449582099915, + "step": 363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000206470489502 + }, + { + "episode": 5840, + "epoch": 0.10497177984685624, + "loss/policy_avg": -0.05213417112827301, + "lr": 2.9302147239263805e-06, + "objective/entropy": -235.3236541748047, + "objective/kl": 16.88799285888672, + "objective/non_score_reward": -1.688799262046814, + "objective/rlhf_reward": -6.355197063088417, + "objective/scores": 0.1, + "policy/approxkl_avg": 340.7134094238281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7127069234848022, + "step": 364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987154006958008 + }, + { + "episode": 5856, + "epoch": 0.10525937376424489, + "loss/policy_avg": 0.2484489381313324, + "lr": 2.9300230061349698e-06, + "objective/entropy": -33.450096130371094, + "objective/kl": 13.260076522827148, + "objective/non_score_reward": -1.326007604598999, + "objective/rlhf_reward": -4.9040304780006405, + "objective/scores": 0.1, + "policy/approxkl_avg": 131.92379760742188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5392622947692871, + "step": 365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975396394729614 + }, + { + "episode": 5872, + "epoch": 0.10554696768163353, + "loss/policy_avg": -0.4047367572784424, + "lr": 2.9298312883435586e-06, + "objective/entropy": 82.97904968261719, + "objective/kl": 7.515501499176025, + "objective/non_score_reward": -0.7515501976013184, + "objective/rlhf_reward": -0.082481582404348, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 43.22978973388672, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5681832432746887, + "step": 366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999988079071045 + }, + { + "episode": 5888, + "epoch": 0.10583456159902219, + "loss/policy_avg": 0.2642351984977722, + "lr": 2.929639570552147e-06, + "objective/entropy": 75.05047607421875, + "objective/kl": 7.34889030456543, + "objective/non_score_reward": -0.734889030456543, + "objective/rlhf_reward": 1.4604437291622165, + "objective/scores": 1.1, + "policy/approxkl_avg": 24.294891357421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6089534759521484, + "step": 367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984581470489502 + }, + { + "episode": 5904, + "epoch": 0.10612215551641083, + "loss/policy_avg": 0.4635845720767975, + "lr": 2.929447852760736e-06, + "objective/entropy": 111.49870300292969, + "objective/kl": 14.92940902709961, + "objective/non_score_reward": -1.4929410219192505, + "objective/rlhf_reward": -1.5717640727758404, + "objective/scores": 1.1, + "policy/approxkl_avg": 170.13226318359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.896259069442749, + "step": 368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.99690842628479 + }, + { + "episode": 5920, + "epoch": 0.10640974943379948, + "loss/policy_avg": 0.0428597554564476, + "lr": 2.929256134969325e-06, + "objective/entropy": -17.536102294921875, + "objective/kl": 13.898289680480957, + "objective/non_score_reward": -1.3898290395736694, + "objective/rlhf_reward": -1.1593163371086117, + "objective/scores": 1.1, + "policy/approxkl_avg": 113.92097473144531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5009787678718567, + "step": 369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998550534248352 + }, + { + "episode": 5936, + "epoch": 0.10669734335118812, + "loss/policy_avg": 0.4961566627025604, + "lr": 2.9290644171779142e-06, + "objective/entropy": 40.97224426269531, + "objective/kl": 8.79596996307373, + "objective/non_score_reward": -0.8795971274375916, + "objective/rlhf_reward": -0.5946695550691811, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 88.30393981933594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6913511753082275, + "step": 370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971789121627808 + }, + { + "episode": 5952, + "epoch": 0.10698493726857677, + "loss/policy_avg": 0.3505915701389313, + "lr": 2.928872699386503e-06, + "objective/entropy": -14.86764907836914, + "objective/kl": 13.301910400390625, + "objective/non_score_reward": -1.3301911354064941, + "objective/rlhf_reward": -3.495935942205499, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 67.43904113769531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5367602109909058, + "step": 371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998258352279663 + }, + { + "episode": 5968, + "epoch": 0.10727253118596541, + "loss/policy_avg": 0.04512263089418411, + "lr": 2.928680981595092e-06, + "objective/entropy": 106.7132568359375, + "objective/kl": 7.053742408752441, + "objective/non_score_reward": -0.7053742408752441, + "objective/rlhf_reward": -2.4214967399835583, + "objective/scores": 0.1, + "policy/approxkl_avg": 64.6439437866211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6978051066398621, + "step": 372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001304149627686 + }, + { + "episode": 5984, + "epoch": 0.10756012510335407, + "loss/policy_avg": 0.10745556652545929, + "lr": 2.928489263803681e-06, + "objective/entropy": -95.63072967529297, + "objective/kl": 16.271547317504883, + "objective/non_score_reward": -1.6271545886993408, + "objective/rlhf_reward": -2.1086182355880734, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.08837890625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8408117294311523, + "step": 373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9974865913391113 + }, + { + "episode": 6000, + "epoch": 0.10784771902074271, + "loss/policy_avg": 0.18899598717689514, + "lr": 2.92829754601227e-06, + "objective/entropy": -149.71536254882812, + "objective/kl": 12.09565544128418, + "objective/non_score_reward": -1.2095654010772705, + "objective/rlhf_reward": -0.438261783123016, + "objective/scores": 1.1, + "policy/approxkl_avg": 102.32780456542969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5978922843933105, + "step": 374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985668659210205 + }, + { + "episode": 6016, + "epoch": 0.10813531293813136, + "loss/policy_avg": 0.0514984093606472, + "lr": 2.928105828220859e-06, + "objective/entropy": 53.81720733642578, + "objective/kl": 13.123618125915527, + "objective/non_score_reward": -1.3123618364334106, + "objective/rlhf_reward": -3.4246185078945865, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 97.61923217773438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.561882495880127, + "step": 375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986340999603271 + }, + { + "episode": 6032, + "epoch": 0.10842290685552, + "loss/policy_avg": 0.05128341168165207, + "lr": 2.927914110429448e-06, + "objective/entropy": -142.33819580078125, + "objective/kl": 12.197196960449219, + "objective/non_score_reward": -1.219719648361206, + "objective/rlhf_reward": -3.274758789602833, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 29.826644897460938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7544271945953369, + "step": 376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0010008811950684 + }, + { + "episode": 6048, + "epoch": 0.10871050077290866, + "loss/policy_avg": 0.3710365891456604, + "lr": 2.9277223926380367e-06, + "objective/entropy": -158.48403930664062, + "objective/kl": 12.475811958312988, + "objective/non_score_reward": -1.2475812435150146, + "objective/rlhf_reward": -6.990324974060059, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.358436584472656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7211358547210693, + "step": 377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9964120388031006 + }, + { + "episode": 6064, + "epoch": 0.1089980946902973, + "loss/policy_avg": 0.44148433208465576, + "lr": 2.927530674846626e-06, + "objective/entropy": 203.32000732421875, + "objective/kl": 11.192790985107422, + "objective/non_score_reward": -1.1192790269851685, + "objective/rlhf_reward": -0.07711610794067347, + "objective/scores": 1.1, + "policy/approxkl_avg": 140.77037048339844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49844634532928467, + "step": 378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996138334274292 + }, + { + "episode": 6080, + "epoch": 0.10928568860768595, + "loss/policy_avg": 0.07269307225942612, + "lr": 2.9273389570552148e-06, + "objective/entropy": -48.06731414794922, + "objective/kl": 7.008990287780762, + "objective/non_score_reward": -0.7008991241455078, + "objective/rlhf_reward": -0.40359642207622537, + "objective/scores": 0.6, + "policy/approxkl_avg": 30.824771881103516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.765288233757019, + "step": 379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009591579437256 + }, + { + "episode": 6096, + "epoch": 0.10957328252507459, + "loss/policy_avg": 0.3331525921821594, + "lr": 2.9271472392638036e-06, + "objective/entropy": -129.8748779296875, + "objective/kl": 6.912174224853516, + "objective/non_score_reward": -0.6912174224853516, + "objective/rlhf_reward": -2.3648696750402447, + "objective/scores": 0.1, + "policy/approxkl_avg": 86.31391143798828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7057325839996338, + "step": 380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999925136566162 + }, + { + "episode": 6112, + "epoch": 0.10986087644246324, + "loss/policy_avg": 0.2209397256374359, + "lr": 2.926955521472393e-06, + "objective/entropy": -17.206472396850586, + "objective/kl": 12.392889022827148, + "objective/non_score_reward": -1.2392890453338623, + "objective/rlhf_reward": -3.2238229076067606, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 61.33112716674805, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7327935695648193, + "step": 381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999511957168579 + }, + { + "episode": 6128, + "epoch": 0.11014847035985188, + "loss/policy_avg": 0.015440240502357483, + "lr": 2.9267638036809816e-06, + "objective/entropy": 136.05276489257812, + "objective/kl": 10.93641471862793, + "objective/non_score_reward": -1.0936416387557983, + "objective/rlhf_reward": -3.9745664507150646, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.861133575439453, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.642856240272522, + "step": 382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005943775177 + }, + { + "episode": 6144, + "epoch": 0.11043606427724054, + "loss/policy_avg": 0.4618483781814575, + "lr": 2.926572085889571e-06, + "objective/entropy": 248.1273193359375, + "objective/kl": 10.450383186340332, + "objective/non_score_reward": -1.0450382232666016, + "objective/rlhf_reward": -6.180152893066406, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.98204231262207, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.585578441619873, + "step": 383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986340999603271 + }, + { + "episode": 6160, + "epoch": 0.11072365819462919, + "loss/policy_avg": 0.527269721031189, + "lr": 2.9263803680981597e-06, + "objective/entropy": -85.81866455078125, + "objective/kl": 7.232128143310547, + "objective/non_score_reward": -0.7232127785682678, + "objective/rlhf_reward": -0.4928510844707489, + "objective/scores": 0.6, + "policy/approxkl_avg": 35.018524169921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6665592789649963, + "step": 384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987972974777222 + }, + { + "episode": 6176, + "epoch": 0.11101125211201783, + "loss/policy_avg": 0.012356449849903584, + "lr": 2.9261886503067485e-06, + "objective/entropy": -87.99043273925781, + "objective/kl": 15.716808319091797, + "objective/non_score_reward": -1.5716807842254639, + "objective/rlhf_reward": -1.8867232263088223, + "objective/scores": 1.1, + "policy/approxkl_avg": 112.06707763671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8166660070419312, + "step": 385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980388879776 + }, + { + "episode": 6192, + "epoch": 0.11129884602940648, + "loss/policy_avg": 0.21891015768051147, + "lr": 2.9259969325153377e-06, + "objective/entropy": 149.92320251464844, + "objective/kl": 15.745594024658203, + "objective/non_score_reward": -1.5745596885681152, + "objective/rlhf_reward": -8.298238754272461, + "objective/scores": -0.5, + "policy/approxkl_avg": 270.8348693847656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5218294858932495, + "step": 386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982883930206299 + }, + { + "episode": 6208, + "epoch": 0.11158643994679512, + "loss/policy_avg": 0.1566510945558548, + "lr": 2.9258052147239265e-06, + "objective/entropy": 279.3895263671875, + "objective/kl": 14.221332550048828, + "objective/non_score_reward": -1.4221333265304565, + "objective/rlhf_reward": -7.688533306121826, + "objective/scores": -0.5, + "policy/approxkl_avg": 161.9359588623047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9130169153213501, + "step": 387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984962940216064 + }, + { + "episode": 6224, + "epoch": 0.11187403386418378, + "loss/policy_avg": 0.3156845271587372, + "lr": 2.9256134969325158e-06, + "objective/entropy": -91.46309661865234, + "objective/kl": 5.5192670822143555, + "objective/non_score_reward": -0.5519267320632935, + "objective/rlhf_reward": -4.207706928253174, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.50323486328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4393225312232971, + "step": 388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989399909973145 + }, + { + "episode": 6240, + "epoch": 0.11216162778157242, + "loss/policy_avg": -0.24140848219394684, + "lr": 2.925421779141104e-06, + "objective/entropy": -6.909185409545898, + "objective/kl": 7.97199821472168, + "objective/non_score_reward": -0.7971999049186707, + "objective/rlhf_reward": -1.066093357578788, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 99.13380432128906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4111158847808838, + "step": 389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.03024959564209 + }, + { + "episode": 6256, + "epoch": 0.11244922169896107, + "loss/policy_avg": 0.10580594837665558, + "lr": 2.9252300613496934e-06, + "objective/entropy": 139.240234375, + "objective/kl": 7.618363857269287, + "objective/non_score_reward": -0.7618364095687866, + "objective/rlhf_reward": -0.6473455265164376, + "objective/scores": 0.6, + "policy/approxkl_avg": 75.60391998291016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6820776462554932, + "step": 390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000373363494873 + }, + { + "episode": 6272, + "epoch": 0.11273681561634971, + "loss/policy_avg": 0.23864039778709412, + "lr": 2.925038343558282e-06, + "objective/entropy": -165.75607299804688, + "objective/kl": 9.443279266357422, + "objective/non_score_reward": -0.9443280696868896, + "objective/rlhf_reward": -5.777312278747559, + "objective/scores": -0.5, + "policy/approxkl_avg": 73.20270538330078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6525593996047974, + "step": 391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987553358078003 + }, + { + "episode": 6288, + "epoch": 0.11302440953373837, + "loss/policy_avg": 0.01731543242931366, + "lr": 2.924846625766871e-06, + "objective/entropy": 202.19996643066406, + "objective/kl": 15.700738906860352, + "objective/non_score_reward": -1.5700738430023193, + "objective/rlhf_reward": -5.880295610427856, + "objective/scores": 0.1, + "policy/approxkl_avg": 44.909934997558594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5537660121917725, + "step": 392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973206520080566 + }, + { + "episode": 6304, + "epoch": 0.113312003451127, + "loss/policy_avg": 0.37011197209358215, + "lr": 2.9246549079754602e-06, + "objective/entropy": -100.13412475585938, + "objective/kl": 7.34639835357666, + "objective/non_score_reward": -0.7346398234367371, + "objective/rlhf_reward": -0.5385593235492707, + "objective/scores": 0.6, + "policy/approxkl_avg": 39.805572509765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5902200937271118, + "step": 393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974175691604614 + }, + { + "episode": 6320, + "epoch": 0.11359959736851566, + "loss/policy_avg": 0.1728161722421646, + "lr": 2.924463190184049e-06, + "objective/entropy": 12.221923828125, + "objective/kl": 8.158146858215332, + "objective/non_score_reward": -0.8158146142959595, + "objective/rlhf_reward": -2.863258576393127, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.08139419555664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6115730404853821, + "step": 394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969968795776367 + }, + { + "episode": 6336, + "epoch": 0.1138871912859043, + "loss/policy_avg": 0.1638275384902954, + "lr": 2.924271472392638e-06, + "objective/entropy": -92.78604125976562, + "objective/kl": 10.350639343261719, + "objective/non_score_reward": -1.0350639820098877, + "objective/rlhf_reward": -3.7402558535337445, + "objective/scores": 0.1, + "policy/approxkl_avg": 192.456298828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5784432888031006, + "step": 395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983556270599365 + }, + { + "episode": 6352, + "epoch": 0.11417478520329295, + "loss/policy_avg": 0.008675817400217056, + "lr": 2.924079754601227e-06, + "objective/entropy": 14.630111694335938, + "objective/kl": 11.06743049621582, + "objective/non_score_reward": -1.1067430973052979, + "objective/rlhf_reward": -4.026972463726997, + "objective/scores": 0.1, + "policy/approxkl_avg": 110.01115417480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9663007259368896, + "step": 396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992257356643677 + }, + { + "episode": 6368, + "epoch": 0.11446237912068159, + "loss/policy_avg": 0.22570039331912994, + "lr": 2.923888036809816e-06, + "objective/entropy": -62.594932556152344, + "objective/kl": 11.640623092651367, + "objective/non_score_reward": -1.1640625, + "objective/rlhf_reward": -2.533543454782043, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 53.970237731933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6753679513931274, + "step": 397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984841346740723 + }, + { + "episode": 6384, + "epoch": 0.11474997303807025, + "loss/policy_avg": 0.18725144863128662, + "lr": 2.923696319018405e-06, + "objective/entropy": -118.94644165039062, + "objective/kl": 10.655905723571777, + "objective/non_score_reward": -1.0655906200408936, + "objective/rlhf_reward": -3.8623625993728634, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.7579345703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.645980954170227, + "step": 398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001778602600098 + }, + { + "episode": 6400, + "epoch": 0.11503756695545889, + "loss/policy_avg": 0.04687364026904106, + "lr": 2.923504601226994e-06, + "objective/entropy": 107.14441680908203, + "objective/kl": 5.046243667602539, + "objective/non_score_reward": -0.5046243071556091, + "objective/rlhf_reward": 2.381502674520016, + "objective/scores": 1.1, + "policy/approxkl_avg": 28.62555503845215, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6493499279022217, + "step": 399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00180721282959 + }, + { + "episode": 6416, + "epoch": 0.11532516087284754, + "loss/policy_avg": 0.18883462250232697, + "lr": 2.9233128834355827e-06, + "objective/entropy": 93.51673126220703, + "objective/kl": 9.593416213989258, + "objective/non_score_reward": -0.9593416452407837, + "objective/rlhf_reward": -1.4373664021492005, + "objective/scores": 0.6, + "policy/approxkl_avg": 43.221275329589844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5561137199401855, + "step": 400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976122379302979 + }, + { + "episode": 6432, + "epoch": 0.11561275479023618, + "loss/policy_avg": 0.07294710725545883, + "lr": 2.923121165644172e-06, + "objective/entropy": 41.92622375488281, + "objective/kl": 10.513269424438477, + "objective/non_score_reward": -1.0513269901275635, + "objective/rlhf_reward": -6.205307960510254, + "objective/scores": -0.5, + "policy/approxkl_avg": 136.67205810546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40027546882629395, + "step": 401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000309944152832 + }, + { + "episode": 6448, + "epoch": 0.11590034870762483, + "loss/policy_avg": -0.12561793625354767, + "lr": 2.9229294478527608e-06, + "objective/entropy": 82.13600158691406, + "objective/kl": 7.441690444946289, + "objective/non_score_reward": -0.7441689968109131, + "objective/rlhf_reward": -0.05295715177175664, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 81.62566375732422, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7616924047470093, + "step": 402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0073320865631104 + }, + { + "episode": 6464, + "epoch": 0.11618794262501349, + "loss/policy_avg": 0.19540420174598694, + "lr": 2.92273773006135e-06, + "objective/entropy": -97.04734802246094, + "objective/kl": 9.546144485473633, + "objective/non_score_reward": -0.954614520072937, + "objective/rlhf_reward": -5.818458080291748, + "objective/scores": -0.5, + "policy/approxkl_avg": 41.53440475463867, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6575814485549927, + "step": 403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998401403427124 + }, + { + "episode": 6480, + "epoch": 0.11647553654240213, + "loss/policy_avg": 0.2724316716194153, + "lr": 2.922546012269939e-06, + "objective/entropy": -222.66941833496094, + "objective/kl": 6.35382080078125, + "objective/non_score_reward": -0.6353820562362671, + "objective/rlhf_reward": -2.141528314352035, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.857295989990234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5307646989822388, + "step": 404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0005037784576416 + }, + { + "episode": 6496, + "epoch": 0.11676313045979078, + "loss/policy_avg": -0.19320714473724365, + "lr": 2.9223542944785276e-06, + "objective/entropy": 128.687744140625, + "objective/kl": 9.543111801147461, + "objective/non_score_reward": -0.954311192035675, + "objective/rlhf_reward": -2.2131247407832912, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 83.3912353515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5714800953865051, + "step": 405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.014221668243408 + }, + { + "episode": 6512, + "epoch": 0.11705072437717942, + "loss/policy_avg": -0.02320697158575058, + "lr": 2.922162576687117e-06, + "objective/entropy": -111.4472427368164, + "objective/kl": 14.546286582946777, + "objective/non_score_reward": -1.4546287059783936, + "objective/rlhf_reward": -5.418514943122863, + "objective/scores": 0.1, + "policy/approxkl_avg": 198.98716735839844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6330698728561401, + "step": 406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000318765640259 + }, + { + "episode": 6528, + "epoch": 0.11733831829456808, + "loss/policy_avg": -0.05689065158367157, + "lr": 2.9219708588957057e-06, + "objective/entropy": -94.29441833496094, + "objective/kl": 6.670037746429443, + "objective/non_score_reward": -0.6670037508010864, + "objective/rlhf_reward": 1.731984862685204, + "objective/scores": 1.1, + "policy/approxkl_avg": 58.32909393310547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4962747097015381, + "step": 407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0033135414123535 + }, + { + "episode": 6544, + "epoch": 0.11762591221195672, + "loss/policy_avg": 0.058236684650182724, + "lr": 2.9217791411042945e-06, + "objective/entropy": 73.15773010253906, + "objective/kl": 10.113113403320312, + "objective/non_score_reward": -1.011311411857605, + "objective/rlhf_reward": -3.6452456325292584, + "objective/scores": 0.1, + "policy/approxkl_avg": 100.70352172851562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4618947505950928, + "step": 408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986381530761719 + }, + { + "episode": 6560, + "epoch": 0.11791350612934537, + "loss/policy_avg": 0.2816429138183594, + "lr": 2.9215874233128837e-06, + "objective/entropy": 199.59906005859375, + "objective/kl": 8.274097442626953, + "objective/non_score_reward": -0.8274096846580505, + "objective/rlhf_reward": -5.309638977050781, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.900413513183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4910353124141693, + "step": 409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000108480453491 + }, + { + "episode": 6576, + "epoch": 0.11820110004673401, + "loss/policy_avg": 0.43750858306884766, + "lr": 2.9213957055214725e-06, + "objective/entropy": 63.67340087890625, + "objective/kl": 17.44285011291504, + "objective/non_score_reward": -1.7442851066589355, + "objective/rlhf_reward": -4.577140188217163, + "objective/scores": 0.6, + "policy/approxkl_avg": 116.11654663085938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7501018047332764, + "step": 410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976780414581299 + }, + { + "episode": 6592, + "epoch": 0.11848869396412266, + "loss/policy_avg": 0.32210612297058105, + "lr": 2.9212039877300618e-06, + "objective/entropy": -220.1966552734375, + "objective/kl": 11.985102653503418, + "objective/non_score_reward": -1.1985102891921997, + "objective/rlhf_reward": -4.39404130578041, + "objective/scores": 0.1, + "policy/approxkl_avg": 161.2996826171875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5652351379394531, + "step": 411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979429244995117 + }, + { + "episode": 6608, + "epoch": 0.1187762878815113, + "loss/policy_avg": 0.1565137803554535, + "lr": 2.92101226993865e-06, + "objective/entropy": -16.16411781311035, + "objective/kl": 14.623466491699219, + "objective/non_score_reward": -1.4623467922210693, + "objective/rlhf_reward": -7.849387168884277, + "objective/scores": -0.5, + "policy/approxkl_avg": 106.36109924316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5213359594345093, + "step": 412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997380256652832 + }, + { + "episode": 6624, + "epoch": 0.11906388179889996, + "loss/policy_avg": 0.16901980340480804, + "lr": 2.9208205521472394e-06, + "objective/entropy": -32.831336975097656, + "objective/kl": 9.713844299316406, + "objective/non_score_reward": -0.9713844060897827, + "objective/rlhf_reward": -1.9381265444325761, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 27.6256160736084, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5835360288619995, + "step": 413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000209093093872 + }, + { + "episode": 6640, + "epoch": 0.1193514757162886, + "loss/policy_avg": -0.03889453411102295, + "lr": 2.920628834355828e-06, + "objective/entropy": 3.4859085083007812, + "objective/kl": 7.356728553771973, + "objective/non_score_reward": -0.7356729507446289, + "objective/rlhf_reward": -1.3385716415086562, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 150.19898986816406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6149972677230835, + "step": 414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001542568206787 + }, + { + "episode": 6656, + "epoch": 0.11963906963367725, + "loss/policy_avg": 0.07131887972354889, + "lr": 2.920437116564417e-06, + "objective/entropy": 135.44573974609375, + "objective/kl": 6.854515075683594, + "objective/non_score_reward": -0.6854515075683594, + "objective/rlhf_reward": -4.7418060302734375, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.425703048706055, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6121791005134583, + "step": 415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997809886932373 + }, + { + "episode": 6672, + "epoch": 0.11992666355106589, + "loss/policy_avg": -0.02449220046401024, + "lr": 2.9202453987730062e-06, + "objective/entropy": -228.9163818359375, + "objective/kl": 3.998107433319092, + "objective/non_score_reward": -0.3998107314109802, + "objective/rlhf_reward": 0.8007570147514342, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.9464802742004395, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6649434566497803, + "step": 416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998199939727783 + }, + { + "episode": 6688, + "epoch": 0.12021425746845454, + "loss/policy_avg": 0.1401190161705017, + "lr": 2.920053680981595e-06, + "objective/entropy": 109.60284423828125, + "objective/kl": 11.780817031860352, + "objective/non_score_reward": -1.178081750869751, + "objective/rlhf_reward": -6.712327003479004, + "objective/scores": -0.5, + "policy/approxkl_avg": 101.54756927490234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6730775833129883, + "step": 417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986844062805176 + }, + { + "episode": 6704, + "epoch": 0.12050185138584318, + "loss/policy_avg": -0.057150907814502716, + "lr": 2.919861963190184e-06, + "objective/entropy": 114.74760437011719, + "objective/kl": 13.613632202148438, + "objective/non_score_reward": -1.361363172531128, + "objective/rlhf_reward": -7.44545316696167, + "objective/scores": -0.5, + "policy/approxkl_avg": 83.6063232421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7602818012237549, + "step": 418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99805748462677 + }, + { + "episode": 6720, + "epoch": 0.12078944530323184, + "loss/policy_avg": 0.33992117643356323, + "lr": 2.919670245398773e-06, + "objective/entropy": 95.6858139038086, + "objective/kl": 13.743134498596191, + "objective/non_score_reward": -1.3743133544921875, + "objective/rlhf_reward": -3.3745473048844676, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 54.830780029296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.745945930480957, + "step": 419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975709915161133 + }, + { + "episode": 6736, + "epoch": 0.12107703922062048, + "loss/policy_avg": 0.04007640480995178, + "lr": 2.919478527607362e-06, + "objective/entropy": 158.9964599609375, + "objective/kl": 11.750536918640137, + "objective/non_score_reward": -1.175053596496582, + "objective/rlhf_reward": -4.300214721262455, + "objective/scores": 0.1, + "policy/approxkl_avg": 262.0268249511719, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.691627025604248, + "step": 420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001215934753418 + }, + { + "episode": 6752, + "epoch": 0.12136463313800913, + "loss/policy_avg": 0.3095911741256714, + "lr": 2.919286809815951e-06, + "objective/entropy": -35.89824676513672, + "objective/kl": 11.786452293395996, + "objective/non_score_reward": -1.178645372390747, + "objective/rlhf_reward": -4.314581578969955, + "objective/scores": 0.1, + "policy/approxkl_avg": 36.27446746826172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8371065855026245, + "step": 421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9973646402359009 + }, + { + "episode": 6768, + "epoch": 0.12165222705539779, + "loss/policy_avg": 0.22105564177036285, + "lr": 2.91909509202454e-06, + "objective/entropy": 15.297624588012695, + "objective/kl": 9.64991283416748, + "objective/non_score_reward": -0.9649913311004639, + "objective/rlhf_reward": -3.459965533018112, + "objective/scores": 0.1, + "policy/approxkl_avg": 37.608909606933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6050717830657959, + "step": 422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99948251247406 + }, + { + "episode": 6784, + "epoch": 0.12193982097278643, + "loss/policy_avg": 0.18998616933822632, + "lr": 2.9189033742331287e-06, + "objective/entropy": -28.578590393066406, + "objective/kl": 5.720818519592285, + "objective/non_score_reward": -0.5720819234848022, + "objective/rlhf_reward": -0.3409164053963978, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.9535547494888306, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5882730484008789, + "step": 423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995017051696777 + }, + { + "episode": 6800, + "epoch": 0.12222741489017508, + "loss/policy_avg": 0.6313632726669312, + "lr": 2.918711656441718e-06, + "objective/entropy": 219.71212768554688, + "objective/kl": 10.877071380615234, + "objective/non_score_reward": -1.0877070426940918, + "objective/rlhf_reward": -6.350828170776367, + "objective/scores": -0.5, + "policy/approxkl_avg": 94.11060333251953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6537734270095825, + "step": 424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975165128707886 + }, + { + "episode": 6816, + "epoch": 0.12251500880756372, + "loss/policy_avg": 0.024708323180675507, + "lr": 2.918519938650307e-06, + "objective/entropy": -79.56177520751953, + "objective/kl": 16.1546573638916, + "objective/non_score_reward": -1.615465760231018, + "objective/rlhf_reward": -2.0618630260229107, + "objective/scores": 1.1, + "policy/approxkl_avg": 151.0323028564453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7285983562469482, + "step": 425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983255863189697 + }, + { + "episode": 6832, + "epoch": 0.12280260272495237, + "loss/policy_avg": 0.07644349336624146, + "lr": 2.918328220858896e-06, + "objective/entropy": -197.13235473632812, + "objective/kl": 7.779457092285156, + "objective/non_score_reward": -0.7779456973075867, + "objective/rlhf_reward": -1.5555234988599567, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 37.899497985839844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6256439685821533, + "step": 426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981521368026733 + }, + { + "episode": 6848, + "epoch": 0.12309019664234101, + "loss/policy_avg": 0.26216840744018555, + "lr": 2.918136503067485e-06, + "objective/entropy": 4.240196228027344, + "objective/kl": 9.129347801208496, + "objective/non_score_reward": -0.9129348397254944, + "objective/rlhf_reward": -5.651739120483398, + "objective/scores": -0.5, + "policy/approxkl_avg": 51.532691955566406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6356798410415649, + "step": 427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993443489074707 + }, + { + "episode": 6864, + "epoch": 0.12337779055972967, + "loss/policy_avg": 0.08391077816486359, + "lr": 2.9179447852760736e-06, + "objective/entropy": 116.065185546875, + "objective/kl": 12.789249420166016, + "objective/non_score_reward": -1.2789249420166016, + "objective/rlhf_reward": -4.71570006608963, + "objective/scores": 0.1, + "policy/approxkl_avg": 107.84225463867188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6037728786468506, + "step": 428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9945564270019531 + }, + { + "episode": 6880, + "epoch": 0.1236653844771183, + "loss/policy_avg": 0.5531384944915771, + "lr": 2.917753067484663e-06, + "objective/entropy": 259.3071594238281, + "objective/kl": 14.592714309692383, + "objective/non_score_reward": -1.4592714309692383, + "objective/rlhf_reward": -4.175226414174425, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 109.08944702148438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6318076848983765, + "step": 429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996614456176758 + }, + { + "episode": 6896, + "epoch": 0.12395297839450696, + "loss/policy_avg": 0.21222534775733948, + "lr": 2.9175613496932517e-06, + "objective/entropy": -58.68345642089844, + "objective/kl": 11.372722625732422, + "objective/non_score_reward": -1.1372722387313843, + "objective/rlhf_reward": -1.625370000244352, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 109.58206939697266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47258567810058594, + "step": 430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987989664077759 + }, + { + "episode": 6912, + "epoch": 0.1242405723118956, + "loss/policy_avg": 0.22482730448246002, + "lr": 2.9173696319018405e-06, + "objective/entropy": 67.2789535522461, + "objective/kl": 9.9976806640625, + "objective/non_score_reward": -0.9997680187225342, + "objective/rlhf_reward": -3.5990721903741356, + "objective/scores": 0.1, + "policy/approxkl_avg": 59.97844696044922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5720049142837524, + "step": 431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989635944366455 + }, + { + "episode": 6928, + "epoch": 0.12452816622928425, + "loss/policy_avg": 0.4322272539138794, + "lr": 2.9171779141104297e-06, + "objective/entropy": -290.14111328125, + "objective/kl": 14.833064079284668, + "objective/non_score_reward": -1.4833064079284668, + "objective/rlhf_reward": -4.108396942886422, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 118.15055847167969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7155340313911438, + "step": 432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9969213008880615 + }, + { + "episode": 6944, + "epoch": 0.1248157601466729, + "loss/policy_avg": 0.8206951022148132, + "lr": 2.9169861963190185e-06, + "objective/entropy": 81.16971588134766, + "objective/kl": 12.39436149597168, + "objective/non_score_reward": -1.2394360303878784, + "objective/rlhf_reward": -0.55774433016777, + "objective/scores": 1.1, + "policy/approxkl_avg": 56.21806335449219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9199215173721313, + "step": 433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997793436050415 + }, + { + "episode": 6960, + "epoch": 0.12510335406406153, + "loss/policy_avg": 0.12932038307189941, + "lr": 2.9167944785276073e-06, + "objective/entropy": -108.72267150878906, + "objective/kl": 9.69011116027832, + "objective/non_score_reward": -0.969011127948761, + "objective/rlhf_reward": -3.476044631004333, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.92694091796875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7485846281051636, + "step": 434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000093936920166 + }, + { + "episode": 6976, + "epoch": 0.1253909479814502, + "loss/policy_avg": 0.25388216972351074, + "lr": 2.916602760736196e-06, + "objective/entropy": 131.2146453857422, + "objective/kl": 12.779239654541016, + "objective/non_score_reward": -1.2779240608215332, + "objective/rlhf_reward": -7.111696243286133, + "objective/scores": -0.5, + "policy/approxkl_avg": 195.51593017578125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3997848629951477, + "step": 435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982001781463623 + }, + { + "episode": 6992, + "epoch": 0.12567854189883884, + "loss/policy_avg": 0.34683698415756226, + "lr": 2.9164110429447854e-06, + "objective/entropy": 94.69212341308594, + "objective/kl": 8.857991218566895, + "objective/non_score_reward": -0.8857991695404053, + "objective/rlhf_reward": -5.543196678161621, + "objective/scores": -0.5, + "policy/approxkl_avg": 67.66943359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6433195471763611, + "step": 436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968899488449097 + }, + { + "episode": 7008, + "epoch": 0.12596613581622748, + "loss/policy_avg": 0.4975810945034027, + "lr": 2.916219325153374e-06, + "objective/entropy": 99.60516357421875, + "objective/kl": 17.621997833251953, + "objective/non_score_reward": -1.762199878692627, + "objective/rlhf_reward": -5.223970900448869, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 126.27960205078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9998393654823303, + "step": 437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977905750274658 + }, + { + "episode": 7024, + "epoch": 0.12625372973361612, + "loss/policy_avg": -0.40271705389022827, + "lr": 2.916027607361963e-06, + "objective/entropy": -58.73802947998047, + "objective/kl": 11.266298294067383, + "objective/non_score_reward": -1.1266300678253174, + "objective/rlhf_reward": -2.68169152286918, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 36.971275329589844, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5164896249771118, + "step": 438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0037713050842285 + }, + { + "episode": 7040, + "epoch": 0.1265413236510048, + "loss/policy_avg": 0.21917138993740082, + "lr": 2.9158358895705522e-06, + "objective/entropy": -138.54464721679688, + "objective/kl": 17.991540908813477, + "objective/non_score_reward": -1.799154281616211, + "objective/rlhf_reward": -5.073910774961982, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 85.51068878173828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4984607994556427, + "step": 439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979517459869385 + }, + { + "episode": 7056, + "epoch": 0.12682891756839343, + "loss/policy_avg": 0.14226309955120087, + "lr": 2.915644171779141e-06, + "objective/entropy": -70.7157974243164, + "objective/kl": 17.338790893554688, + "objective/non_score_reward": -1.7338790893554688, + "objective/rlhf_reward": -4.535516625642776, + "objective/scores": 0.6, + "policy/approxkl_avg": 281.78021240234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6945629119873047, + "step": 440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976835250854492 + }, + { + "episode": 7072, + "epoch": 0.12711651148578207, + "loss/policy_avg": 0.7052698135375977, + "lr": 2.9154524539877303e-06, + "objective/entropy": -51.95797348022461, + "objective/kl": 11.16838264465332, + "objective/non_score_reward": -1.1168383359909058, + "objective/rlhf_reward": -4.067353239655494, + "objective/scores": 0.1, + "policy/approxkl_avg": 145.99929809570312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7760474681854248, + "step": 441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976942539215088 + }, + { + "episode": 7088, + "epoch": 0.12740410540317074, + "loss/policy_avg": 0.2039000242948532, + "lr": 2.915260736196319e-06, + "objective/entropy": 70.03821563720703, + "objective/kl": 10.011724472045898, + "objective/non_score_reward": -1.0011723041534424, + "objective/rlhf_reward": -3.604689425230026, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.66981506347656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5425417423248291, + "step": 442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985206127166748 + }, + { + "episode": 7104, + "epoch": 0.12769169932055938, + "loss/policy_avg": 0.046909917145967484, + "lr": 2.915069018404908e-06, + "objective/entropy": -109.3448486328125, + "objective/kl": 10.856573104858398, + "objective/non_score_reward": -1.0856573581695557, + "objective/rlhf_reward": -3.9426295816898342, + "objective/scores": 0.1, + "policy/approxkl_avg": 99.86489868164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5526108741760254, + "step": 443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977734088897705 + }, + { + "episode": 7120, + "epoch": 0.12797929323794802, + "loss/policy_avg": 0.3472464084625244, + "lr": 2.914877300613497e-06, + "objective/entropy": 133.83642578125, + "objective/kl": 8.187213897705078, + "objective/non_score_reward": -0.8187214136123657, + "objective/rlhf_reward": -1.6130260578995808, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 60.895503997802734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6827129125595093, + "step": 444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999275207519531 + }, + { + "episode": 7136, + "epoch": 0.12826688715533666, + "loss/policy_avg": 0.030289731919765472, + "lr": 2.914685582822086e-06, + "objective/entropy": -95.79446411132812, + "objective/kl": 12.967110633850098, + "objective/non_score_reward": -1.2967112064361572, + "objective/rlhf_reward": -4.786844870448112, + "objective/scores": 0.1, + "policy/approxkl_avg": 158.4815673828125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6717748641967773, + "step": 445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999089241027832 + }, + { + "episode": 7152, + "epoch": 0.12855448107272532, + "loss/policy_avg": 0.3508151173591614, + "lr": 2.9144938650306748e-06, + "objective/entropy": -29.020809173583984, + "objective/kl": 10.402070045471191, + "objective/non_score_reward": -1.0402069091796875, + "objective/rlhf_reward": -1.2371087416422095, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 22.769393920898438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6285759210586548, + "step": 446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999580383300781 + }, + { + "episode": 7168, + "epoch": 0.12884207499011396, + "loss/policy_avg": 0.5095102190971375, + "lr": 2.914302147239264e-06, + "objective/entropy": 4.9149627685546875, + "objective/kl": 11.270059585571289, + "objective/non_score_reward": -1.1270060539245605, + "objective/rlhf_reward": -0.10802436470985377, + "objective/scores": 1.1, + "policy/approxkl_avg": 42.081207275390625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6107115745544434, + "step": 447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997491836547852 + }, + { + "episode": 7184, + "epoch": 0.1291296689075026, + "loss/policy_avg": 0.15434984862804413, + "lr": 2.914110429447853e-06, + "objective/entropy": 93.47442626953125, + "objective/kl": 5.827376842498779, + "objective/non_score_reward": -0.5827376842498779, + "objective/rlhf_reward": -4.330950736999512, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.859039306640625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5905852913856506, + "step": 448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985287189483643 + }, + { + "episode": 7200, + "epoch": 0.12941726282489124, + "loss/policy_avg": 0.05858859419822693, + "lr": 2.913918711656442e-06, + "objective/entropy": -168.11392211914062, + "objective/kl": 12.220864295959473, + "objective/non_score_reward": -1.2220864295959473, + "objective/rlhf_reward": -6.888345718383789, + "objective/scores": -0.5, + "policy/approxkl_avg": 64.31013488769531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6738055348396301, + "step": 449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984550476074219 + }, + { + "episode": 7216, + "epoch": 0.1297048567422799, + "loss/policy_avg": 0.6401174068450928, + "lr": 2.913726993865031e-06, + "objective/entropy": 143.6673126220703, + "objective/kl": 16.03870964050293, + "objective/non_score_reward": -1.6038709878921509, + "objective/rlhf_reward": -6.015483951568603, + "objective/scores": 0.1, + "policy/approxkl_avg": 215.43011474609375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.852541446685791, + "step": 450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9969959259033203 + }, + { + "episode": 7232, + "epoch": 0.12999245065966855, + "loss/policy_avg": -0.20923450589179993, + "lr": 2.9135352760736196e-06, + "objective/entropy": -48.08766174316406, + "objective/kl": 3.6980080604553223, + "objective/non_score_reward": -0.36980074644088745, + "objective/rlhf_reward": -3.479203224182129, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.520061492919922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49264973402023315, + "step": 451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0105478763580322 + }, + { + "episode": 7248, + "epoch": 0.1302800445770572, + "loss/policy_avg": 0.2910463511943817, + "lr": 2.913343558282209e-06, + "objective/entropy": -25.366256713867188, + "objective/kl": 18.143402099609375, + "objective/non_score_reward": -1.814340353012085, + "objective/rlhf_reward": -9.25736141204834, + "objective/scores": -0.5, + "policy/approxkl_avg": 229.06854248046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5178902745246887, + "step": 452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988359212875366 + }, + { + "episode": 7264, + "epoch": 0.13056763849444583, + "loss/policy_avg": 0.019646476954221725, + "lr": 2.9131518404907977e-06, + "objective/entropy": 163.14837646484375, + "objective/kl": 15.703920364379883, + "objective/non_score_reward": -1.5703920125961304, + "objective/rlhf_reward": -4.548234538237253, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 97.07273864746094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.629855751991272, + "step": 453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000038146972656 + }, + { + "episode": 7280, + "epoch": 0.1308552324118345, + "loss/policy_avg": 0.03351786732673645, + "lr": 2.912960122699387e-06, + "objective/entropy": 106.53297424316406, + "objective/kl": 15.689282417297363, + "objective/non_score_reward": -1.5689281225204468, + "objective/rlhf_reward": -8.275712966918945, + "objective/scores": -0.5, + "policy/approxkl_avg": 168.91738891601562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5836728811264038, + "step": 454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003838539123535 + }, + { + "episode": 7296, + "epoch": 0.13114282632922314, + "loss/policy_avg": 0.025853008031845093, + "lr": 2.9127684049079757e-06, + "objective/entropy": -120.41839599609375, + "objective/kl": 14.210792541503906, + "objective/non_score_reward": -1.4210792779922485, + "objective/rlhf_reward": -4.022457575023758, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 25.721412658691406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5456880331039429, + "step": 455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998733639717102 + }, + { + "episode": 7312, + "epoch": 0.13143042024661178, + "loss/policy_avg": 0.11350809037685394, + "lr": 2.912576687116564e-06, + "objective/entropy": 95.34707641601562, + "objective/kl": 14.430747985839844, + "objective/non_score_reward": -1.4430747032165527, + "objective/rlhf_reward": -5.3722985744476315, + "objective/scores": 0.1, + "policy/approxkl_avg": 132.41342163085938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5699273347854614, + "step": 456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.995617151260376 + }, + { + "episode": 7328, + "epoch": 0.13171801416400042, + "loss/policy_avg": 0.5415328741073608, + "lr": 2.9123849693251534e-06, + "objective/entropy": -35.476009368896484, + "objective/kl": 8.802606582641602, + "objective/non_score_reward": -0.880260705947876, + "objective/rlhf_reward": -3.1210429728031155, + "objective/scores": 0.1, + "policy/approxkl_avg": 70.38357543945312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6030310988426208, + "step": 457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000958204269409 + }, + { + "episode": 7344, + "epoch": 0.1320056080813891, + "loss/policy_avg": 0.18966403603553772, + "lr": 2.912193251533742e-06, + "objective/entropy": 160.48800659179688, + "objective/kl": 14.643467903137207, + "objective/non_score_reward": -1.4643468856811523, + "objective/rlhf_reward": -1.45738730430603, + "objective/scores": 1.1, + "policy/approxkl_avg": 121.34632110595703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7794003486633301, + "step": 458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975476264953613 + }, + { + "episode": 7360, + "epoch": 0.13229320199877773, + "loss/policy_avg": 0.038652483373880386, + "lr": 2.9120015337423314e-06, + "objective/entropy": -118.91299438476562, + "objective/kl": 8.336200714111328, + "objective/non_score_reward": -0.8336200714111328, + "objective/rlhf_reward": -1.7303604072967347, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 37.65449523925781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7707014083862305, + "step": 459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998978614807129 + }, + { + "episode": 7376, + "epoch": 0.13258079591616637, + "loss/policy_avg": 0.2408444583415985, + "lr": 2.91180981595092e-06, + "objective/entropy": 8.406875610351562, + "objective/kl": 12.288520812988281, + "objective/non_score_reward": -1.2288521528244019, + "objective/rlhf_reward": -6.915408134460449, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.923194885253906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7405025959014893, + "step": 460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999840497970581 + }, + { + "episode": 7392, + "epoch": 0.13286838983355503, + "loss/policy_avg": 0.15941619873046875, + "lr": 2.911618098159509e-06, + "objective/entropy": 113.75840759277344, + "objective/kl": 7.790184020996094, + "objective/non_score_reward": -0.7790184617042542, + "objective/rlhf_reward": -2.7160738766193386, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.8814697265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7081342935562134, + "step": 461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986152648925781 + }, + { + "episode": 7408, + "epoch": 0.13315598375094367, + "loss/policy_avg": 0.012417584657669067, + "lr": 2.9114263803680982e-06, + "objective/entropy": -38.02949142456055, + "objective/kl": 7.286294937133789, + "objective/non_score_reward": -0.728629469871521, + "objective/rlhf_reward": -4.914518356323242, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.29476547241211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.38131576776504517, + "step": 462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974234104156494 + }, + { + "episode": 7424, + "epoch": 0.1334435776683323, + "loss/policy_avg": -0.01223007496446371, + "lr": 2.911234662576687e-06, + "objective/entropy": -89.77444458007812, + "objective/kl": 6.793033599853516, + "objective/non_score_reward": -0.6793034076690674, + "objective/rlhf_reward": -2.3172135859727856, + "objective/scores": 0.1, + "policy/approxkl_avg": 0.7712490558624268, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5063762664794922, + "step": 463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0153064727783203 + }, + { + "episode": 7440, + "epoch": 0.13373117158572095, + "loss/policy_avg": 0.7638596892356873, + "lr": 2.9110429447852763e-06, + "objective/entropy": -124.14325714111328, + "objective/kl": 12.126808166503906, + "objective/non_score_reward": -1.2126808166503906, + "objective/rlhf_reward": -4.450723177194595, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.479761123657227, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6185396313667297, + "step": 464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9997889995574951 + }, + { + "episode": 7456, + "epoch": 0.13401876550310962, + "loss/policy_avg": -0.23089107871055603, + "lr": 2.910851226993865e-06, + "objective/entropy": 12.087081909179688, + "objective/kl": 9.909775733947754, + "objective/non_score_reward": -0.9909776449203491, + "objective/rlhf_reward": 0.4360893458127979, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.47049713134766, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6658675670623779, + "step": 465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0070953369140625 + }, + { + "episode": 7472, + "epoch": 0.13430635942049826, + "loss/policy_avg": 0.0948818027973175, + "lr": 2.910659509202454e-06, + "objective/entropy": 121.13216400146484, + "objective/kl": 3.1745963096618652, + "objective/non_score_reward": -0.3174596130847931, + "objective/rlhf_reward": 0.33428153776733316, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.6572422981262207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6701205372810364, + "step": 466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000531673431396 + }, + { + "episode": 7488, + "epoch": 0.1345939533378869, + "loss/policy_avg": 0.06078179180622101, + "lr": 2.910467791411043e-06, + "objective/entropy": 109.86882019042969, + "objective/kl": 11.02462387084961, + "objective/non_score_reward": -1.1024622917175293, + "objective/rlhf_reward": -4.009849047660827, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.87171936035156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5864859819412231, + "step": 467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973387718200684 + }, + { + "episode": 7504, + "epoch": 0.13488154725527554, + "loss/policy_avg": 0.3744744658470154, + "lr": 2.910276073619632e-06, + "objective/entropy": 224.23721313476562, + "objective/kl": 11.501972198486328, + "objective/non_score_reward": -1.1501970291137695, + "objective/rlhf_reward": -4.200788414478302, + "objective/scores": 0.1, + "policy/approxkl_avg": 118.61778259277344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9559670090675354, + "step": 468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999631643295288 + }, + { + "episode": 7520, + "epoch": 0.1351691411726642, + "loss/policy_avg": 0.3056102395057678, + "lr": 2.9100843558282208e-06, + "objective/entropy": -3.655149459838867, + "objective/kl": 9.39276123046875, + "objective/non_score_reward": -0.9392762184143066, + "objective/rlhf_reward": 0.642895066738129, + "objective/scores": 1.1, + "policy/approxkl_avg": 105.86042785644531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5348349809646606, + "step": 469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986598491668701 + }, + { + "episode": 7536, + "epoch": 0.13545673509005285, + "loss/policy_avg": 0.26237964630126953, + "lr": 2.90989263803681e-06, + "objective/entropy": -334.96240234375, + "objective/kl": 11.451269149780273, + "objective/non_score_reward": -1.1451269388198853, + "objective/rlhf_reward": -0.18050771057605708, + "objective/scores": 1.1, + "policy/approxkl_avg": 53.953277587890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8247783184051514, + "step": 470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9965124130249023 + }, + { + "episode": 7552, + "epoch": 0.1357443290074415, + "loss/policy_avg": 0.2353116273880005, + "lr": 2.909700920245399e-06, + "objective/entropy": -169.8182373046875, + "objective/kl": 14.575862884521484, + "objective/non_score_reward": -1.4575862884521484, + "objective/rlhf_reward": -1.430345384776592, + "objective/scores": 1.1, + "policy/approxkl_avg": 134.40069580078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6239136457443237, + "step": 471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998337745666504 + }, + { + "episode": 7568, + "epoch": 0.13603192292483013, + "loss/policy_avg": 0.39453232288360596, + "lr": 2.909509202453988e-06, + "objective/entropy": 88.90142822265625, + "objective/kl": 10.912429809570312, + "objective/non_score_reward": -1.091243028640747, + "objective/rlhf_reward": 0.03502755761146581, + "objective/scores": 1.1, + "policy/approxkl_avg": 41.724300384521484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8498834371566772, + "step": 472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997682571411133 + }, + { + "episode": 7584, + "epoch": 0.1363195168422188, + "loss/policy_avg": 0.8089221119880676, + "lr": 2.909317484662577e-06, + "objective/entropy": 245.41949462890625, + "objective/kl": 19.53654670715332, + "objective/non_score_reward": -1.9536547660827637, + "objective/rlhf_reward": -6.258360086885049, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 363.8136291503906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6021559834480286, + "step": 473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999751091003418 + }, + { + "episode": 7600, + "epoch": 0.13660711075960744, + "loss/policy_avg": 0.5075941681861877, + "lr": 2.9091257668711657e-06, + "objective/entropy": -48.53450012207031, + "objective/kl": 5.752985000610352, + "objective/non_score_reward": -0.5752984285354614, + "objective/rlhf_reward": 2.098806151747704, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.8740129470825195, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.45245441794395447, + "step": 474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002760887145996 + }, + { + "episode": 7616, + "epoch": 0.13689470467699608, + "loss/policy_avg": 0.11127430945634842, + "lr": 2.908934049079755e-06, + "objective/entropy": -34.70821762084961, + "objective/kl": 13.361335754394531, + "objective/non_score_reward": -1.336133599281311, + "objective/rlhf_reward": -3.2218282840409618, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 80.18733978271484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6081266403198242, + "step": 475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994287490844727 + }, + { + "episode": 7632, + "epoch": 0.13718229859438472, + "loss/policy_avg": 0.10813181102275848, + "lr": 2.9087423312883437e-06, + "objective/entropy": 111.931640625, + "objective/kl": 6.184762954711914, + "objective/non_score_reward": -0.6184762716293335, + "objective/rlhf_reward": -0.6490763976898899, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.89632511138916, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4316413700580597, + "step": 476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000701904296875 + }, + { + "episode": 7648, + "epoch": 0.13746989251177338, + "loss/policy_avg": 0.4999096989631653, + "lr": 2.908550613496933e-06, + "objective/entropy": -403.03533935546875, + "objective/kl": 9.01245403289795, + "objective/non_score_reward": -0.9012453556060791, + "objective/rlhf_reward": -5.604981422424316, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.18182373046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5862891674041748, + "step": 477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975061416625977 + }, + { + "episode": 7664, + "epoch": 0.13775748642916202, + "loss/policy_avg": 0.14539723098278046, + "lr": 2.9083588957055213e-06, + "objective/entropy": -126.68431091308594, + "objective/kl": 18.38888168334961, + "objective/non_score_reward": -1.8388880491256714, + "objective/rlhf_reward": -9.355552673339844, + "objective/scores": -0.5, + "policy/approxkl_avg": 197.85671997070312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6251143217086792, + "step": 478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994767904281616 + }, + { + "episode": 7680, + "epoch": 0.13804508034655066, + "loss/policy_avg": 0.0330502912402153, + "lr": 2.9081671779141105e-06, + "objective/entropy": -44.419708251953125, + "objective/kl": 15.150094985961914, + "objective/non_score_reward": -1.5150094032287598, + "objective/rlhf_reward": -8.060037612915039, + "objective/scores": -0.5, + "policy/approxkl_avg": 151.68356323242188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7994130253791809, + "step": 479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995399713516235 + }, + { + "episode": 7696, + "epoch": 0.13833267426393933, + "loss/policy_avg": 0.14779314398765564, + "lr": 2.9079754601226994e-06, + "objective/entropy": 105.36286926269531, + "objective/kl": 8.258722305297852, + "objective/non_score_reward": -0.8258723616600037, + "objective/rlhf_reward": -2.9034894019365307, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.71802520751953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5357914566993713, + "step": 480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9967106580734253 + }, + { + "episode": 7712, + "epoch": 0.13862026818132797, + "loss/policy_avg": 0.27561917901039124, + "lr": 2.907783742331288e-06, + "objective/entropy": -65.93132019042969, + "objective/kl": 11.474952697753906, + "objective/non_score_reward": -1.1474955081939697, + "objective/rlhf_reward": -6.589982032775879, + "objective/scores": -0.5, + "policy/approxkl_avg": 69.8592529296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.572990894317627, + "step": 481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971063137054443 + }, + { + "episode": 7728, + "epoch": 0.1389078620987166, + "loss/policy_avg": 0.31458884477615356, + "lr": 2.9075920245398774e-06, + "objective/entropy": -12.270210266113281, + "objective/kl": 11.645172119140625, + "objective/non_score_reward": -1.1645172834396362, + "objective/rlhf_reward": -1.7343501045715537, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 109.0627670288086, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.47612980008125305, + "step": 482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962890148162842 + }, + { + "episode": 7744, + "epoch": 0.13919545601610525, + "loss/policy_avg": 0.6003249883651733, + "lr": 2.907400306748466e-06, + "objective/entropy": 110.20198059082031, + "objective/kl": 10.82795524597168, + "objective/non_score_reward": -1.0827956199645996, + "objective/rlhf_reward": -6.331182479858398, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.25099182128906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49656587839126587, + "step": 483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974627494812012 + }, + { + "episode": 7760, + "epoch": 0.13948304993349392, + "loss/policy_avg": -0.30824440717697144, + "lr": 2.907208588957055e-06, + "objective/entropy": 195.908447265625, + "objective/kl": 11.146462440490723, + "objective/non_score_reward": -1.1146461963653564, + "objective/rlhf_reward": -4.058584606647491, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.840267181396484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7134747505187988, + "step": 484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003911256790161 + }, + { + "episode": 7776, + "epoch": 0.13977064385088256, + "loss/policy_avg": 0.2714402973651886, + "lr": 2.9070168711656443e-06, + "objective/entropy": 200.92269897460938, + "objective/kl": 9.644775390625, + "objective/non_score_reward": -0.9644776582717896, + "objective/rlhf_reward": 0.5420892477035526, + "objective/scores": 1.1, + "policy/approxkl_avg": 84.43876647949219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5147348642349243, + "step": 485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978854656219482 + }, + { + "episode": 7792, + "epoch": 0.1400582377682712, + "loss/policy_avg": 0.06527984887361526, + "lr": 2.906825153374233e-06, + "objective/entropy": 108.83843994140625, + "objective/kl": 6.777806282043457, + "objective/non_score_reward": -0.6777806282043457, + "objective/rlhf_reward": 1.6888774499297146, + "objective/scores": 1.1, + "policy/approxkl_avg": 19.67444610595703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6244113445281982, + "step": 486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988651275634766 + }, + { + "episode": 7808, + "epoch": 0.14034583168565984, + "loss/policy_avg": 0.16577281057834625, + "lr": 2.9066334355828223e-06, + "objective/entropy": 89.5904541015625, + "objective/kl": 5.467947483062744, + "objective/non_score_reward": -0.5467947125434875, + "objective/rlhf_reward": -1.7871789395809172, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.6594181060791, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.589790940284729, + "step": 487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983232021331787 + }, + { + "episode": 7824, + "epoch": 0.1406334256030485, + "loss/policy_avg": 0.04526926949620247, + "lr": 2.906441717791411e-06, + "objective/entropy": -265.4620056152344, + "objective/kl": 5.726231575012207, + "objective/non_score_reward": -0.5726232528686523, + "objective/rlhf_reward": 2.109507152438164, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.101959228515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5848753452301025, + "step": 488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989309310913086 + }, + { + "episode": 7840, + "epoch": 0.14092101952043715, + "loss/policy_avg": 0.4120207726955414, + "lr": 2.90625e-06, + "objective/entropy": -195.201171875, + "objective/kl": 11.044137001037598, + "objective/non_score_reward": -1.104413628578186, + "objective/rlhf_reward": -4.017654529213905, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.49024200439453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6557003259658813, + "step": 489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995417594909668 + }, + { + "episode": 7856, + "epoch": 0.1412086134378258, + "loss/policy_avg": 0.07290078699588776, + "lr": 2.906058282208589e-06, + "objective/entropy": -118.10462951660156, + "objective/kl": 13.856963157653809, + "objective/non_score_reward": -1.3856964111328125, + "objective/rlhf_reward": -5.142785763740539, + "objective/scores": 0.1, + "policy/approxkl_avg": 48.285400390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5914438366889954, + "step": 490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999072790145874 + }, + { + "episode": 7872, + "epoch": 0.14149620735521443, + "loss/policy_avg": 0.19345812499523163, + "lr": 2.905866564417178e-06, + "objective/entropy": -72.50758361816406, + "objective/kl": 10.323736190795898, + "objective/non_score_reward": -1.0323736667633057, + "objective/rlhf_reward": -3.7294944696128365, + "objective/scores": 0.1, + "policy/approxkl_avg": 89.49455261230469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6444407105445862, + "step": 491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993422031402588 + }, + { + "episode": 7888, + "epoch": 0.1417838012726031, + "loss/policy_avg": 0.2960240840911865, + "lr": 2.905674846625767e-06, + "objective/entropy": -52.92985534667969, + "objective/kl": 10.259614944458008, + "objective/non_score_reward": -1.0259615182876587, + "objective/rlhf_reward": 0.2961540907621387, + "objective/scores": 1.1, + "policy/approxkl_avg": 101.89286804199219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6554427742958069, + "step": 492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968318939208984 + }, + { + "episode": 7904, + "epoch": 0.14207139518999173, + "loss/policy_avg": 0.18822041153907776, + "lr": 2.905483128834356e-06, + "objective/entropy": 212.44216918945312, + "objective/kl": 15.652924537658691, + "objective/non_score_reward": -1.5652923583984375, + "objective/rlhf_reward": -4.599310164869415, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 205.89048767089844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5811524987220764, + "step": 493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9962797164916992 + }, + { + "episode": 7920, + "epoch": 0.14235898910738037, + "loss/policy_avg": 0.12098196893930435, + "lr": 2.905291411042945e-06, + "objective/entropy": 153.53717041015625, + "objective/kl": 20.49001693725586, + "objective/non_score_reward": -2.049001693725586, + "objective/rlhf_reward": -6.073301257864509, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 77.01814270019531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7395473718643188, + "step": 494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977500438690186 + }, + { + "episode": 7936, + "epoch": 0.142646583024769, + "loss/policy_avg": -0.01829097419977188, + "lr": 2.905099693251534e-06, + "objective/entropy": 66.50902557373047, + "objective/kl": 10.364259719848633, + "objective/non_score_reward": -1.036426067352295, + "objective/rlhf_reward": -1.2219853147279947, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 26.25365447998047, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4803291857242584, + "step": 495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991976022720337 + }, + { + "episode": 7952, + "epoch": 0.14293417694215768, + "loss/policy_avg": 0.10506206750869751, + "lr": 2.904907975460123e-06, + "objective/entropy": 136.27267456054688, + "objective/kl": 6.317910194396973, + "objective/non_score_reward": -0.6317909955978394, + "objective/rlhf_reward": -4.527163982391357, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.371524810791016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6706969738006592, + "step": 496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975488185882568 + }, + { + "episode": 7968, + "epoch": 0.14322177085954632, + "loss/policy_avg": 0.4831598401069641, + "lr": 2.9047162576687117e-06, + "objective/entropy": -31.924362182617188, + "objective/kl": 11.602783203125, + "objective/non_score_reward": -1.1602783203125, + "objective/rlhf_reward": -4.2411130428314205, + "objective/scores": 0.1, + "policy/approxkl_avg": 87.01671600341797, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7876095771789551, + "step": 497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99977707862854 + }, + { + "episode": 7984, + "epoch": 0.14350936477693496, + "loss/policy_avg": 1.053896427154541, + "lr": 2.904524539877301e-06, + "objective/entropy": 100.03763580322266, + "objective/kl": 9.37060260772705, + "objective/non_score_reward": -0.9370602369308472, + "objective/rlhf_reward": -1.6255346558251715, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.041189193725586, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6169747114181519, + "step": 498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0003209114074707 + }, + { + "episode": 8000, + "epoch": 0.14379695869432363, + "loss/policy_avg": 0.15767307579517365, + "lr": 2.9043328220858897e-06, + "objective/entropy": 32.90543746948242, + "objective/kl": 7.48246955871582, + "objective/non_score_reward": -0.7482469081878662, + "objective/rlhf_reward": -1.3311283044224842, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 31.293258666992188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7192215919494629, + "step": 499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001739978790283 + }, + { + "episode": 8016, + "epoch": 0.14408455261171227, + "loss/policy_avg": 0.3422059416770935, + "lr": 2.904141104294479e-06, + "objective/entropy": -277.40203857421875, + "objective/kl": 13.774396896362305, + "objective/non_score_reward": -1.3774398565292358, + "objective/rlhf_reward": -7.509759426116943, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.98161315917969, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6245834827423096, + "step": 500, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9970264434814453 + }, + { + "episode": 8032, + "epoch": 0.1443721465291009, + "loss/policy_avg": 0.4801827669143677, + "lr": 2.9039493865030673e-06, + "objective/entropy": 83.53890228271484, + "objective/kl": 8.603704452514648, + "objective/non_score_reward": -0.8603705167770386, + "objective/rlhf_reward": -0.5177629336130348, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 94.87248992919922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7735698223114014, + "step": 501, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000962257385254 + }, + { + "episode": 8048, + "epoch": 0.14465974044648955, + "loss/policy_avg": 0.1936042606830597, + "lr": 2.9037576687116566e-06, + "objective/entropy": 16.39226531982422, + "objective/kl": 13.502885818481445, + "objective/non_score_reward": -1.3502888679504395, + "objective/rlhf_reward": -3.453743944840367, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 83.78414916992188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7048830986022949, + "step": 502, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994542598724365 + }, + { + "episode": 8064, + "epoch": 0.14494733436387822, + "loss/policy_avg": 0.1588769406080246, + "lr": 2.9035659509202454e-06, + "objective/entropy": -40.00954055786133, + "objective/kl": 18.89105987548828, + "objective/non_score_reward": -1.8891057968139648, + "objective/rlhf_reward": -4.632704351783964, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 181.8993682861328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5775229930877686, + "step": 503, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997877836227417 + }, + { + "episode": 8080, + "epoch": 0.14523492828126686, + "loss/policy_avg": 0.6248252987861633, + "lr": 2.903374233128834e-06, + "objective/entropy": -210.8176727294922, + "objective/kl": 10.124932289123535, + "objective/non_score_reward": -1.0124932527542114, + "objective/rlhf_reward": -3.649973204731941, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.988563537597656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6010805368423462, + "step": 504, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989054203033447 + }, + { + "episode": 8096, + "epoch": 0.1455225221986555, + "loss/policy_avg": 0.1622433364391327, + "lr": 2.9031825153374234e-06, + "objective/entropy": 17.784866333007812, + "objective/kl": 9.719334602355957, + "objective/non_score_reward": -0.9719333648681641, + "objective/rlhf_reward": 0.5122664213180546, + "objective/scores": 1.1, + "policy/approxkl_avg": 56.30583953857422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6292353272438049, + "step": 505, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0021533966064453 + }, + { + "episode": 8112, + "epoch": 0.14581011611604414, + "loss/policy_avg": -0.1938256323337555, + "lr": 2.9029907975460122e-06, + "objective/entropy": 127.38702392578125, + "objective/kl": 5.040970802307129, + "objective/non_score_reward": -0.5040971636772156, + "objective/rlhf_reward": -1.6163885876536368, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.422080993652344, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5696605443954468, + "step": 506, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0037827491760254 + }, + { + "episode": 8128, + "epoch": 0.1460977100334328, + "loss/policy_avg": 0.38081973791122437, + "lr": 2.902799079754601e-06, + "objective/entropy": 42.37715530395508, + "objective/kl": 7.989552021026611, + "objective/non_score_reward": -0.7989552021026611, + "objective/rlhf_reward": -1.073114486710105, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 50.31044006347656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6788329482078552, + "step": 507, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99943208694458 + }, + { + "episode": 8144, + "epoch": 0.14638530395082144, + "loss/policy_avg": 0.3784164786338806, + "lr": 2.9026073619631903e-06, + "objective/entropy": -50.67826843261719, + "objective/kl": 13.639227867126465, + "objective/non_score_reward": -1.3639228343963623, + "objective/rlhf_reward": -7.455691337585449, + "objective/scores": -0.5, + "policy/approxkl_avg": 109.05412292480469, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8064265847206116, + "step": 508, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993607997894287 + }, + { + "episode": 8160, + "epoch": 0.14667289786821008, + "loss/policy_avg": 0.13403823971748352, + "lr": 2.902415644171779e-06, + "objective/entropy": 96.67460632324219, + "objective/kl": 12.861349105834961, + "objective/non_score_reward": -1.2861348390579224, + "objective/rlhf_reward": -3.540419537488537, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 196.5169677734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47650325298309326, + "step": 509, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008530616760254 + }, + { + "episode": 8176, + "epoch": 0.14696049178559872, + "loss/policy_avg": 0.2548585534095764, + "lr": 2.9022239263803683e-06, + "objective/entropy": -259.3787841796875, + "objective/kl": 13.765344619750977, + "objective/non_score_reward": -1.376534342765808, + "objective/rlhf_reward": -5.106137281656265, + "objective/scores": 0.1, + "policy/approxkl_avg": 124.93861389160156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.578956663608551, + "step": 510, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9970561265945435 + }, + { + "episode": 8192, + "epoch": 0.1472480857029874, + "loss/policy_avg": 0.32806700468063354, + "lr": 2.902032208588957e-06, + "objective/entropy": -89.41710662841797, + "objective/kl": 11.446915626525879, + "objective/non_score_reward": -1.1446915864944458, + "objective/rlhf_reward": -2.7539375975456943, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 100.32626342773438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6500498056411743, + "step": 511, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999818801879883 + }, + { + "episode": 8208, + "epoch": 0.14753567962037603, + "loss/policy_avg": 0.1273409128189087, + "lr": 2.901840490797546e-06, + "objective/entropy": 69.84151458740234, + "objective/kl": 18.043598175048828, + "objective/non_score_reward": -1.804359793663025, + "objective/rlhf_reward": -4.8174391150474545, + "objective/scores": 0.6, + "policy/approxkl_avg": 85.71723937988281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5488656163215637, + "step": 512, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998365879058838 + }, + { + "episode": 8224, + "epoch": 0.14782327353776467, + "loss/policy_avg": 1.1218316555023193, + "lr": 2.901648773006135e-06, + "objective/entropy": -103.88362121582031, + "objective/kl": 12.342060089111328, + "objective/non_score_reward": -1.2342060804367065, + "objective/rlhf_reward": -4.536824202537536, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.8322827816009521, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.735711932182312, + "step": 513, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000443458557129 + }, + { + "episode": 8240, + "epoch": 0.1481108674551533, + "loss/policy_avg": -0.014876842498779297, + "lr": 2.901457055214724e-06, + "objective/entropy": 75.55338287353516, + "objective/kl": 12.932259559631348, + "objective/non_score_reward": -1.2932257652282715, + "objective/rlhf_reward": -3.439569846789042, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 28.75712776184082, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.526807963848114, + "step": 514, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995429515838623 + }, + { + "episode": 8256, + "epoch": 0.14839846137254198, + "loss/policy_avg": 0.11381683498620987, + "lr": 2.901265337423313e-06, + "objective/entropy": 28.295608520507812, + "objective/kl": 12.695294380187988, + "objective/non_score_reward": -1.2695293426513672, + "objective/rlhf_reward": -4.678117549419403, + "objective/scores": 0.1, + "policy/approxkl_avg": 62.5381965637207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4214191138744354, + "step": 515, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000277519226074 + }, + { + "episode": 8272, + "epoch": 0.14868605528993062, + "loss/policy_avg": 0.031907178461551666, + "lr": 2.901073619631902e-06, + "objective/entropy": 202.26161193847656, + "objective/kl": 7.357514381408691, + "objective/non_score_reward": -0.7357515096664429, + "objective/rlhf_reward": -2.5430058300495144, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.950460433959961, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7936433553695679, + "step": 516, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013363361358643 + }, + { + "episode": 8288, + "epoch": 0.14897364920731926, + "loss/policy_avg": 0.1753254234790802, + "lr": 2.900881901840491e-06, + "objective/entropy": -103.65457153320312, + "objective/kl": 12.325798034667969, + "objective/non_score_reward": -1.2325799465179443, + "objective/rlhf_reward": -2.807613672987495, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 124.17157745361328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6873651742935181, + "step": 517, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0010011196136475 + }, + { + "episode": 8304, + "epoch": 0.1492612431247079, + "loss/policy_avg": 0.49640148878097534, + "lr": 2.90069018404908e-06, + "objective/entropy": -23.451457977294922, + "objective/kl": 11.856928825378418, + "objective/non_score_reward": -1.1856927871704102, + "objective/rlhf_reward": -2.7953601283597305, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 100.8171157836914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.530035138130188, + "step": 518, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997998595237732 + }, + { + "episode": 8320, + "epoch": 0.14954883704209657, + "loss/policy_avg": -0.026080047711730003, + "lr": 2.900498466257669e-06, + "objective/entropy": -170.7906951904297, + "objective/kl": 5.801922798156738, + "objective/non_score_reward": -0.5801923871040344, + "objective/rlhf_reward": 2.0792303770780567, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.545293807983398, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4394223093986511, + "step": 519, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988439083099365 + }, + { + "episode": 8336, + "epoch": 0.1498364309594852, + "loss/policy_avg": 0.18013785779476166, + "lr": 2.9003067484662577e-06, + "objective/entropy": -30.96051788330078, + "objective/kl": 15.340864181518555, + "objective/non_score_reward": -1.5340864658355713, + "objective/rlhf_reward": -5.736345967650413, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.957459449768066, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7148102521896362, + "step": 520, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0002388954162598 + }, + { + "episode": 8352, + "epoch": 0.15012402487687385, + "loss/policy_avg": 0.21995435655117035, + "lr": 2.900115030674847e-06, + "objective/entropy": 118.80613708496094, + "objective/kl": 13.875422477722168, + "objective/non_score_reward": -1.3875422477722168, + "objective/rlhf_reward": -5.150168991088867, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.807886123657227, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6429756879806519, + "step": 521, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986985921859741 + }, + { + "episode": 8368, + "epoch": 0.1504116187942625, + "loss/policy_avg": 0.44822782278060913, + "lr": 2.8999233128834357e-06, + "objective/entropy": 131.02780151367188, + "objective/kl": 8.63813304901123, + "objective/non_score_reward": -0.8638133406639099, + "objective/rlhf_reward": -3.0552533924579617, + "objective/scores": 0.1, + "policy/approxkl_avg": 54.1556510925293, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8205841779708862, + "step": 522, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984720945358276 + }, + { + "episode": 8384, + "epoch": 0.15069921271165115, + "loss/policy_avg": 0.2645617723464966, + "lr": 2.8997315950920245e-06, + "objective/entropy": -156.31088256835938, + "objective/kl": 7.035023212432861, + "objective/non_score_reward": -0.7035022974014282, + "objective/rlhf_reward": -4.814008712768555, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.48949432373047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49438732862472534, + "step": 523, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9992444515228271 + }, + { + "episode": 8400, + "epoch": 0.1509868066290398, + "loss/policy_avg": 0.09130215644836426, + "lr": 2.8995398773006133e-06, + "objective/entropy": -78.74007415771484, + "objective/kl": 8.69231128692627, + "objective/non_score_reward": -0.8692312240600586, + "objective/rlhf_reward": -0.5532056882393088, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 96.52299499511719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46065402030944824, + "step": 524, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998188853263855 + }, + { + "episode": 8416, + "epoch": 0.15127440054642843, + "loss/policy_avg": 0.13580232858657837, + "lr": 2.8993481595092026e-06, + "objective/entropy": -72.97970581054688, + "objective/kl": 7.367366313934326, + "objective/non_score_reward": -0.7367366552352905, + "objective/rlhf_reward": -4.946946620941162, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.75210189819336, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7579455375671387, + "step": 525, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000337600708008 + }, + { + "episode": 8432, + "epoch": 0.1515619944638171, + "loss/policy_avg": 0.11613625288009644, + "lr": 2.8991564417177914e-06, + "objective/entropy": 83.00586700439453, + "objective/kl": 12.637749671936035, + "objective/non_score_reward": -1.2637748718261719, + "objective/rlhf_reward": -3.2302709474888553, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 89.52025604248047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5902446508407593, + "step": 526, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989619255065918 + }, + { + "episode": 8448, + "epoch": 0.15184958838120574, + "loss/policy_avg": 0.03080570697784424, + "lr": 2.89896472392638e-06, + "objective/entropy": 106.05413055419922, + "objective/kl": 8.069011688232422, + "objective/non_score_reward": -0.8069012761116028, + "objective/rlhf_reward": -0.3038861199629035, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 105.38276672363281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5903005599975586, + "step": 527, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979126453399658 + }, + { + "episode": 8464, + "epoch": 0.15213718229859438, + "loss/policy_avg": 0.3046284317970276, + "lr": 2.8987730061349694e-06, + "objective/entropy": 152.33953857421875, + "objective/kl": 14.629440307617188, + "objective/non_score_reward": -1.4629439115524292, + "objective/rlhf_reward": -4.189916288078415, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 60.608787536621094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6111043691635132, + "step": 528, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986634254455566 + }, + { + "episode": 8480, + "epoch": 0.15242477621598302, + "loss/policy_avg": 0.3153807520866394, + "lr": 2.8985812883435582e-06, + "objective/entropy": 11.483592987060547, + "objective/kl": 7.498883247375488, + "objective/non_score_reward": -0.7498883008956909, + "objective/rlhf_reward": -2.5995532035827633, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.66455841064453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.530735969543457, + "step": 529, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997819185256958 + }, + { + "episode": 8496, + "epoch": 0.1527123701333717, + "loss/policy_avg": -0.4344179630279541, + "lr": 2.8983895705521475e-06, + "objective/entropy": 172.50282287597656, + "objective/kl": 9.609460830688477, + "objective/non_score_reward": -0.9609460234642029, + "objective/rlhf_reward": -5.843784332275391, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.63162231445312, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8059056997299194, + "step": 530, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002572536468506 + }, + { + "episode": 8512, + "epoch": 0.15299996405076033, + "loss/policy_avg": 0.09429708868265152, + "lr": 2.8981978527607363e-06, + "objective/entropy": 294.640625, + "objective/kl": 18.882694244384766, + "objective/non_score_reward": -1.888269305229187, + "objective/rlhf_reward": -7.153077340126037, + "objective/scores": 0.1, + "policy/approxkl_avg": 111.17985534667969, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7101346254348755, + "step": 531, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989078044891357 + }, + { + "episode": 8528, + "epoch": 0.15328755796814897, + "loss/policy_avg": 0.15176677703857422, + "lr": 2.898006134969325e-06, + "objective/entropy": 136.55715942382812, + "objective/kl": 12.547300338745117, + "objective/non_score_reward": -1.2547301054000854, + "objective/rlhf_reward": -0.6189204812049862, + "objective/scores": 1.1, + "policy/approxkl_avg": 103.96243286132812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8805792331695557, + "step": 532, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000082015991211 + }, + { + "episode": 8544, + "epoch": 0.1535751518855376, + "loss/policy_avg": 0.2759408950805664, + "lr": 2.8978144171779143e-06, + "objective/entropy": 157.77984619140625, + "objective/kl": 8.116591453552246, + "objective/non_score_reward": -0.8116590976715088, + "objective/rlhf_reward": -5.246636390686035, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.0709867477417, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7335650324821472, + "step": 533, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999115228652954 + }, + { + "episode": 8560, + "epoch": 0.15386274580292628, + "loss/policy_avg": 0.49007368087768555, + "lr": 2.897622699386503e-06, + "objective/entropy": -148.64157104492188, + "objective/kl": 15.987449645996094, + "objective/non_score_reward": -1.5987448692321777, + "objective/rlhf_reward": -5.994979923963546, + "objective/scores": 0.1, + "policy/approxkl_avg": 167.39520263671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6771738529205322, + "step": 534, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.995469093322754 + }, + { + "episode": 8576, + "epoch": 0.15415033972031492, + "loss/policy_avg": 0.2475529909133911, + "lr": 2.897430981595092e-06, + "objective/entropy": 63.576107025146484, + "objective/kl": 8.813644409179688, + "objective/non_score_reward": -0.8813644051551819, + "objective/rlhf_reward": -3.125457620620727, + "objective/scores": 0.1, + "policy/approxkl_avg": 47.86650848388672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7202204465866089, + "step": 535, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9946643114089966 + }, + { + "episode": 8592, + "epoch": 0.15443793363770356, + "loss/policy_avg": 0.3695685565471649, + "lr": 2.897239263803681e-06, + "objective/entropy": 72.53046417236328, + "objective/kl": 15.155887603759766, + "objective/non_score_reward": -1.5155887603759766, + "objective/rlhf_reward": -1.6623550713062283, + "objective/scores": 1.1, + "policy/approxkl_avg": 99.77005004882812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.731490969657898, + "step": 536, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989895820617676 + }, + { + "episode": 8608, + "epoch": 0.1547255275550922, + "loss/policy_avg": 0.19564469158649445, + "lr": 2.89704754601227e-06, + "objective/entropy": 104.1103515625, + "objective/kl": 12.909381866455078, + "objective/non_score_reward": -1.290938377380371, + "objective/rlhf_reward": -4.7637532413005825, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.42005920410156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4913232922554016, + "step": 537, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9961007833480835 + }, + { + "episode": 8624, + "epoch": 0.15501312147248086, + "loss/policy_avg": 0.0638544037938118, + "lr": 2.896855828220859e-06, + "objective/entropy": 120.96896362304688, + "objective/kl": 17.086078643798828, + "objective/non_score_reward": -1.708607792854309, + "objective/rlhf_reward": -6.434431171417236, + "objective/scores": 0.1, + "policy/approxkl_avg": 112.74113464355469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.625002920627594, + "step": 538, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973227977752686 + }, + { + "episode": 8640, + "epoch": 0.1553007153898695, + "loss/policy_avg": 0.4011816382408142, + "lr": 2.896664110429448e-06, + "objective/entropy": 95.19891357421875, + "objective/kl": 5.403826713562012, + "objective/non_score_reward": -0.5403826236724854, + "objective/rlhf_reward": -4.161530494689941, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.818931579589844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.44589221477508545, + "step": 539, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999190330505371 + }, + { + "episode": 8656, + "epoch": 0.15558830930725814, + "loss/policy_avg": 0.006004150491207838, + "lr": 2.896472392638037e-06, + "objective/entropy": -98.64430236816406, + "objective/kl": 13.903255462646484, + "objective/non_score_reward": -1.3903255462646484, + "objective/rlhf_reward": -3.4385958335557323, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 96.46614074707031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5993703603744507, + "step": 540, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983282089233398 + }, + { + "episode": 8672, + "epoch": 0.1558759032246468, + "loss/policy_avg": 0.25113996863365173, + "lr": 2.896280674846626e-06, + "objective/entropy": 28.177112579345703, + "objective/kl": 10.27774715423584, + "objective/non_score_reward": -1.027774691581726, + "objective/rlhf_reward": -3.711098855733871, + "objective/scores": 0.1, + "policy/approxkl_avg": 15.50900650024414, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.38805845379829407, + "step": 541, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975563287734985 + }, + { + "episode": 8688, + "epoch": 0.15616349714203545, + "loss/policy_avg": 0.22983065247535706, + "lr": 2.896088957055215e-06, + "objective/entropy": -43.48884963989258, + "objective/kl": 16.519594192504883, + "objective/non_score_reward": -1.6519595384597778, + "objective/rlhf_reward": -4.207838369905948, + "objective/scores": 0.6, + "policy/approxkl_avg": 145.91189575195312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4935316741466522, + "step": 542, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9969571828842163 + }, + { + "episode": 8704, + "epoch": 0.1564510910594241, + "loss/policy_avg": 0.5034677386283875, + "lr": 2.895897239263804e-06, + "objective/entropy": -180.35511779785156, + "objective/kl": 11.565031051635742, + "objective/non_score_reward": -1.1565032005310059, + "objective/rlhf_reward": -0.22601289153099025, + "objective/scores": 1.1, + "policy/approxkl_avg": 45.29411315917969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6911122798919678, + "step": 543, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9962868690490723 + }, + { + "episode": 8720, + "epoch": 0.15673868497681273, + "loss/policy_avg": 0.02647099643945694, + "lr": 2.895705521472393e-06, + "objective/entropy": -299.92938232421875, + "objective/kl": 12.21584701538086, + "objective/non_score_reward": -1.2215845584869385, + "objective/rlhf_reward": -4.486337954550981, + "objective/scores": 0.1, + "policy/approxkl_avg": 101.26480102539062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5777586698532104, + "step": 544, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0000030994415283 + }, + { + "episode": 8736, + "epoch": 0.1570262788942014, + "loss/policy_avg": 0.0650666356086731, + "lr": 2.8955138036809817e-06, + "objective/entropy": -25.684341430664062, + "objective/kl": 12.18275260925293, + "objective/non_score_reward": -1.2182753086090088, + "objective/rlhf_reward": -0.47310084700584376, + "objective/scores": 1.1, + "policy/approxkl_avg": 73.90670776367188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5378080606460571, + "step": 545, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9970163106918335 + }, + { + "episode": 8752, + "epoch": 0.15731387281159004, + "loss/policy_avg": 0.3169589042663574, + "lr": 2.8953220858895705e-06, + "objective/entropy": 100.88613891601562, + "objective/kl": 7.750062942504883, + "objective/non_score_reward": -0.7750062942504883, + "objective/rlhf_reward": -5.100025177001953, + "objective/scores": -0.5, + "policy/approxkl_avg": 76.69898986816406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7943099737167358, + "step": 546, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006103515625 + }, + { + "episode": 8768, + "epoch": 0.15760146672897868, + "loss/policy_avg": 0.15454688668251038, + "lr": 2.8951303680981593e-06, + "objective/entropy": -160.29241943359375, + "objective/kl": 12.760615348815918, + "objective/non_score_reward": -1.2760615348815918, + "objective/rlhf_reward": -4.704246199131012, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.389159202575684, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6418617963790894, + "step": 547, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000383138656616 + }, + { + "episode": 8784, + "epoch": 0.15788906064636732, + "loss/policy_avg": -0.0003520110622048378, + "lr": 2.8949386503067486e-06, + "objective/entropy": 29.00008773803711, + "objective/kl": 12.984089851379395, + "objective/non_score_reward": -1.2984089851379395, + "objective/rlhf_reward": -2.2699168368589606, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.11572265625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.479004442691803, + "step": 548, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004520416259766 + }, + { + "episode": 8800, + "epoch": 0.158176654563756, + "loss/policy_avg": -0.15695317089557648, + "lr": 2.8947469325153374e-06, + "objective/entropy": -84.50102233886719, + "objective/kl": 18.578369140625, + "objective/non_score_reward": -1.8578369617462158, + "objective/rlhf_reward": -5.698014610509077, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 86.01622009277344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4987892508506775, + "step": 549, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977936744689941 + }, + { + "episode": 8816, + "epoch": 0.15846424848114463, + "loss/policy_avg": -0.01714038848876953, + "lr": 2.894555214723926e-06, + "objective/entropy": -128.65835571289062, + "objective/kl": 5.4775614738464355, + "objective/non_score_reward": -0.5477561950683594, + "objective/rlhf_reward": -4.1910247802734375, + "objective/scores": -0.5, + "policy/approxkl_avg": 34.354248046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5765225291252136, + "step": 550, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985828399658203 + }, + { + "episode": 8832, + "epoch": 0.15875184239853327, + "loss/policy_avg": 0.19021713733673096, + "lr": 2.8943634969325154e-06, + "objective/entropy": -61.83000946044922, + "objective/kl": 10.497882843017578, + "objective/non_score_reward": -1.0497883558273315, + "objective/rlhf_reward": -3.7991533041000363, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.45492553710938, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4981521964073181, + "step": 551, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998138666152954 + }, + { + "episode": 8848, + "epoch": 0.1590394363159219, + "loss/policy_avg": 0.06787601113319397, + "lr": 2.8941717791411042e-06, + "objective/entropy": 38.238014221191406, + "objective/kl": 15.474870681762695, + "objective/non_score_reward": -1.5474871397018433, + "objective/rlhf_reward": -8.189949035644531, + "objective/scores": -0.5, + "policy/approxkl_avg": 82.42859649658203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5804091095924377, + "step": 552, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993863105773926 + }, + { + "episode": 8864, + "epoch": 0.15932703023331057, + "loss/policy_avg": 0.30045267939567566, + "lr": 2.8939800613496935e-06, + "objective/entropy": -27.927902221679688, + "objective/kl": 10.419059753417969, + "objective/non_score_reward": -1.0419061183929443, + "objective/rlhf_reward": 0.23237573504447973, + "objective/scores": 1.1, + "policy/approxkl_avg": 209.59939575195312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6831308603286743, + "step": 553, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001933574676514 + }, + { + "episode": 8880, + "epoch": 0.1596146241506992, + "loss/policy_avg": 0.2085457146167755, + "lr": 2.8937883435582823e-06, + "objective/entropy": 35.96397018432617, + "objective/kl": 9.092164993286133, + "objective/non_score_reward": -0.9092164635658264, + "objective/rlhf_reward": -1.9035325507322947, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 39.68387222290039, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5189159512519836, + "step": 554, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000199794769287 + }, + { + "episode": 8896, + "epoch": 0.15990221806808785, + "loss/policy_avg": 0.3386860191822052, + "lr": 2.893596625766871e-06, + "objective/entropy": 12.668701171875, + "objective/kl": 14.887137413024902, + "objective/non_score_reward": -1.4887138605117798, + "objective/rlhf_reward": -5.554855218529701, + "objective/scores": 0.1, + "policy/approxkl_avg": 57.799781799316406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8298596143722534, + "step": 555, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001885414123535 + }, + { + "episode": 8912, + "epoch": 0.1601898119854765, + "loss/policy_avg": 0.2826036214828491, + "lr": 2.8934049079754603e-06, + "objective/entropy": 291.82574462890625, + "objective/kl": 11.674125671386719, + "objective/non_score_reward": -1.167412519454956, + "objective/rlhf_reward": -6.669650077819824, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.4175567626953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7528454065322876, + "step": 556, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999050498008728 + }, + { + "episode": 8928, + "epoch": 0.16047740590286516, + "loss/policy_avg": 0.10318401455879211, + "lr": 2.893213190184049e-06, + "objective/entropy": 120.07017517089844, + "objective/kl": 4.665571212768555, + "objective/non_score_reward": -0.46655717492103577, + "objective/rlhf_reward": -1.4662286698818208, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.405817031860352, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5051665306091309, + "step": 557, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997723937034607 + }, + { + "episode": 8944, + "epoch": 0.1607649998202538, + "loss/policy_avg": -0.29497459530830383, + "lr": 2.893021472392638e-06, + "objective/entropy": 37.394432067871094, + "objective/kl": 9.016752243041992, + "objective/non_score_reward": -0.9016750454902649, + "objective/rlhf_reward": -1.6592889232205705, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 60.83589172363281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6633663773536682, + "step": 558, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003632068634033 + }, + { + "episode": 8960, + "epoch": 0.16105259373764244, + "loss/policy_avg": 0.27766960859298706, + "lr": 2.892829754601227e-06, + "objective/entropy": 162.43531799316406, + "objective/kl": 14.33592414855957, + "objective/non_score_reward": -1.433592438697815, + "objective/rlhf_reward": -2.8106508597147197, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 65.74765014648438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7771644592285156, + "step": 559, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998669147491455 + }, + { + "episode": 8976, + "epoch": 0.1613401876550311, + "loss/policy_avg": 0.10618109256029129, + "lr": 2.892638036809816e-06, + "objective/entropy": -214.1174774169922, + "objective/kl": 7.891653537750244, + "objective/non_score_reward": -0.7891653776168823, + "objective/rlhf_reward": 1.2433385044336323, + "objective/scores": 1.1, + "policy/approxkl_avg": 46.754520416259766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6963553428649902, + "step": 560, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995391368865967 + }, + { + "episode": 8992, + "epoch": 0.16162778157241975, + "loss/policy_avg": 0.16953638195991516, + "lr": 2.892446319018405e-06, + "objective/entropy": -72.53345489501953, + "objective/kl": 14.390440940856934, + "objective/non_score_reward": -1.4390441179275513, + "objective/rlhf_reward": -5.356176471710205, + "objective/scores": 0.1, + "policy/approxkl_avg": 157.7322235107422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4842403531074524, + "step": 561, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982311725616455 + }, + { + "episode": 9008, + "epoch": 0.1619153754898084, + "loss/policy_avg": -0.21345758438110352, + "lr": 2.892254601226994e-06, + "objective/entropy": 70.15734100341797, + "objective/kl": 8.587663650512695, + "objective/non_score_reward": -0.8587663769721985, + "objective/rlhf_reward": -5.435065269470215, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.32007598876953, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6229357719421387, + "step": 562, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0034308433532715 + }, + { + "episode": 9024, + "epoch": 0.16220296940719703, + "loss/policy_avg": 0.628226637840271, + "lr": 2.892062883435583e-06, + "objective/entropy": 71.88837432861328, + "objective/kl": 13.11463737487793, + "objective/non_score_reward": -1.3114639520645142, + "objective/rlhf_reward": -7.245855331420898, + "objective/scores": -0.5, + "policy/approxkl_avg": 86.9578628540039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6590292453765869, + "step": 563, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999263048171997 + }, + { + "episode": 9040, + "epoch": 0.1624905633245857, + "loss/policy_avg": 0.4895268380641937, + "lr": 2.891871165644172e-06, + "objective/entropy": -137.79327392578125, + "objective/kl": 11.846264839172363, + "objective/non_score_reward": -1.1846263408660889, + "objective/rlhf_reward": -6.7385053634643555, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.822731018066406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6190036535263062, + "step": 564, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992012977600098 + }, + { + "episode": 9056, + "epoch": 0.16277815724197434, + "loss/policy_avg": 0.02343292161822319, + "lr": 2.891679447852761e-06, + "objective/entropy": -80.07128143310547, + "objective/kl": 8.20994758605957, + "objective/non_score_reward": -0.8209947943687439, + "objective/rlhf_reward": -2.8839792668819424, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.4821832180023193, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7127578258514404, + "step": 565, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.004193067550659 + }, + { + "episode": 9072, + "epoch": 0.16306575115936298, + "loss/policy_avg": 0.0619128942489624, + "lr": 2.89148773006135e-06, + "objective/entropy": -55.37126922607422, + "objective/kl": 17.618186950683594, + "objective/non_score_reward": -1.7618186473846436, + "objective/rlhf_reward": -4.12355539643881, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 126.11732482910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5896855592727661, + "step": 566, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002530097961426 + }, + { + "episode": 9088, + "epoch": 0.16335334507675162, + "loss/policy_avg": 0.33223459124565125, + "lr": 2.8912960122699385e-06, + "objective/entropy": 44.957420349121094, + "objective/kl": 12.796667098999023, + "objective/non_score_reward": -1.2796669006347656, + "objective/rlhf_reward": -7.1186676025390625, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.001811027526855, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4202909469604492, + "step": 567, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002686977386475 + }, + { + "episode": 9104, + "epoch": 0.16364093899414028, + "loss/policy_avg": 0.46875202655792236, + "lr": 2.8911042944785277e-06, + "objective/entropy": 75.29147338867188, + "objective/kl": 14.741684913635254, + "objective/non_score_reward": -1.4741685390472412, + "objective/rlhf_reward": -4.071845050129006, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 119.49514770507812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4648001790046692, + "step": 568, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998049259185791 + }, + { + "episode": 9120, + "epoch": 0.16392853291152892, + "loss/policy_avg": 0.037746116518974304, + "lr": 2.8909125766871165e-06, + "objective/entropy": -114.35173797607422, + "objective/kl": 17.751426696777344, + "objective/non_score_reward": -1.7751425504684448, + "objective/rlhf_reward": -4.176851276994917, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 63.35493850708008, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7318030595779419, + "step": 569, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995402097702026 + }, + { + "episode": 9136, + "epoch": 0.16421612682891756, + "loss/policy_avg": 0.12556898593902588, + "lr": 2.8907208588957053e-06, + "objective/entropy": 275.97406005859375, + "objective/kl": 16.65966796875, + "objective/non_score_reward": -1.6659668684005737, + "objective/rlhf_reward": -3.740148504019949, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 182.56800842285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7940037250518799, + "step": 570, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998062252998352 + }, + { + "episode": 9152, + "epoch": 0.1645037207463062, + "loss/policy_avg": 0.19311276078224182, + "lr": 2.8905291411042946e-06, + "objective/entropy": -13.802238464355469, + "objective/kl": 11.889385223388672, + "objective/non_score_reward": -1.1889386177062988, + "objective/rlhf_reward": -0.3557543516159054, + "objective/scores": 1.1, + "policy/approxkl_avg": 168.97225952148438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5607097148895264, + "step": 571, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966776371002197 + }, + { + "episode": 9168, + "epoch": 0.16479131466369487, + "loss/policy_avg": -0.0551675409078598, + "lr": 2.8903374233128834e-06, + "objective/entropy": 199.9774627685547, + "objective/kl": 10.514884948730469, + "objective/non_score_reward": -1.0514883995056152, + "objective/rlhf_reward": -3.8059536576271054, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.9141960144043, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5954399108886719, + "step": 572, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0013375282287598 + }, + { + "episode": 9184, + "epoch": 0.1650789085810835, + "loss/policy_avg": 0.5291420817375183, + "lr": 2.890145705521472e-06, + "objective/entropy": -87.00729370117188, + "objective/kl": 17.926801681518555, + "objective/non_score_reward": -1.792680263519287, + "objective/rlhf_reward": -5.048014911190544, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 174.02410888671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6229202747344971, + "step": 573, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971938133239746 + }, + { + "episode": 9200, + "epoch": 0.16536650249847215, + "loss/policy_avg": 0.026936478912830353, + "lr": 2.8899539877300614e-06, + "objective/entropy": 16.224300384521484, + "objective/kl": 12.667734146118164, + "objective/non_score_reward": -1.2667735815048218, + "objective/rlhf_reward": -0.667094303667545, + "objective/scores": 1.1, + "policy/approxkl_avg": 61.12175369262695, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6178743243217468, + "step": 574, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982073307037354 + }, + { + "episode": 9216, + "epoch": 0.1656540964158608, + "loss/policy_avg": 0.3519830107688904, + "lr": 2.8897622699386502e-06, + "objective/entropy": -118.76617431640625, + "objective/kl": 12.919654846191406, + "objective/non_score_reward": -1.2919654846191406, + "objective/rlhf_reward": -3.5060024909382927, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 151.7967529296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4337049126625061, + "step": 575, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996576189994812 + }, + { + "episode": 9232, + "epoch": 0.16594169033324946, + "loss/policy_avg": 0.10243120789527893, + "lr": 2.8895705521472395e-06, + "objective/entropy": -136.20498657226562, + "objective/kl": 8.501260757446289, + "objective/non_score_reward": -0.8501260280609131, + "objective/rlhf_reward": -1.8442450006871969, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 44.17273712158203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6579757928848267, + "step": 576, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9993293285369873 + }, + { + "episode": 9248, + "epoch": 0.1662292842506381, + "loss/policy_avg": 0.09091467410326004, + "lr": 2.8893788343558283e-06, + "objective/entropy": -303.64678955078125, + "objective/kl": 8.604415893554688, + "objective/non_score_reward": -0.8604416847229004, + "objective/rlhf_reward": -1.6169378116455784, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 31.75139045715332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7412898540496826, + "step": 577, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997444152832031 + }, + { + "episode": 9264, + "epoch": 0.16651687816802674, + "loss/policy_avg": 0.21000587940216064, + "lr": 2.889187116564417e-06, + "objective/entropy": 239.27052307128906, + "objective/kl": 10.989913940429688, + "objective/non_score_reward": -1.0989913940429688, + "objective/rlhf_reward": 0.004034423828125355, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.15020179748535, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5851229429244995, + "step": 578, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004048347473145 + }, + { + "episode": 9280, + "epoch": 0.1668044720854154, + "loss/policy_avg": 0.22401443123817444, + "lr": 2.8889953987730063e-06, + "objective/entropy": 83.70755004882812, + "objective/kl": 11.797503471374512, + "objective/non_score_reward": -1.1797504425048828, + "objective/rlhf_reward": -1.7952826365244117, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 76.76632690429688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6816673278808594, + "step": 579, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980626106262207 + }, + { + "episode": 9296, + "epoch": 0.16709206600280405, + "loss/policy_avg": -0.2864213287830353, + "lr": 2.888803680981595e-06, + "objective/entropy": -0.46259307861328125, + "objective/kl": 8.927458763122559, + "objective/non_score_reward": -0.8927459716796875, + "objective/rlhf_reward": 0.8290162324905399, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.566313743591309, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4375641345977783, + "step": 580, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000452995300293 + }, + { + "episode": 9312, + "epoch": 0.16737965992019269, + "loss/policy_avg": 0.34181928634643555, + "lr": 2.8886119631901844e-06, + "objective/entropy": -10.957565307617188, + "objective/kl": 11.041678428649902, + "objective/non_score_reward": -1.1041678190231323, + "objective/rlhf_reward": -4.016671425104141, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.29110717773438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5227146148681641, + "step": 581, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998535394668579 + }, + { + "episode": 9328, + "epoch": 0.16766725383758133, + "loss/policy_avg": 0.7440632581710815, + "lr": 2.888420245398773e-06, + "objective/entropy": -169.97946166992188, + "objective/kl": 5.892565727233887, + "objective/non_score_reward": -0.5892565846443176, + "objective/rlhf_reward": -1.9570263683795928, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.19253158569336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6428844928741455, + "step": 582, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971792697906494 + }, + { + "episode": 9344, + "epoch": 0.16795484775497, + "loss/policy_avg": 0.039995964616537094, + "lr": 2.888228527607362e-06, + "objective/entropy": 11.100467681884766, + "objective/kl": 8.740276336669922, + "objective/non_score_reward": -0.874027669429779, + "objective/rlhf_reward": -3.0961106330156323, + "objective/scores": 0.1, + "policy/approxkl_avg": 68.66104125976562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5335364937782288, + "step": 583, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975062608718872 + }, + { + "episode": 9360, + "epoch": 0.16824244167235863, + "loss/policy_avg": 0.18156462907791138, + "lr": 2.8880368098159512e-06, + "objective/entropy": 146.76071166992188, + "objective/kl": 13.685348510742188, + "objective/non_score_reward": -1.368534803390503, + "objective/rlhf_reward": -2.5504204376947612, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 198.45315551757812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6299135684967041, + "step": 584, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996903896331787 + }, + { + "episode": 9376, + "epoch": 0.16853003558974727, + "loss/policy_avg": 0.8130825757980347, + "lr": 2.88784509202454e-06, + "objective/entropy": 111.27020263671875, + "objective/kl": 10.760910034179688, + "objective/non_score_reward": -1.076090931892395, + "objective/rlhf_reward": -3.9043638467788693, + "objective/scores": 0.1, + "policy/approxkl_avg": 103.25462341308594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5371990203857422, + "step": 585, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976208209991455 + }, + { + "episode": 9392, + "epoch": 0.1688176295071359, + "loss/policy_avg": 0.39816397428512573, + "lr": 2.887653374233129e-06, + "objective/entropy": -99.50511932373047, + "objective/kl": 9.237605094909668, + "objective/non_score_reward": -0.9237604737281799, + "objective/rlhf_reward": -5.695041656494141, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.84246063232422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5522249341011047, + "step": 586, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989943504333496 + }, + { + "episode": 9408, + "epoch": 0.16910522342452458, + "loss/policy_avg": 0.16980989277362823, + "lr": 2.887461656441718e-06, + "objective/entropy": -199.70681762695312, + "objective/kl": 11.351678848266602, + "objective/non_score_reward": -1.1351678371429443, + "objective/rlhf_reward": -2.1406713783740994, + "objective/scores": 0.6, + "policy/approxkl_avg": 99.28809356689453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6828469038009644, + "step": 587, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995384216308594 + }, + { + "episode": 9424, + "epoch": 0.16939281734191322, + "loss/policy_avg": 0.4977327585220337, + "lr": 2.887269938650307e-06, + "objective/entropy": 92.04341888427734, + "objective/kl": 7.678049087524414, + "objective/non_score_reward": -0.7678048610687256, + "objective/rlhf_reward": -2.671219593286514, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.31591796875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6588965654373169, + "step": 588, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978811740875244 + }, + { + "episode": 9440, + "epoch": 0.16968041125930186, + "loss/policy_avg": 0.7905654311180115, + "lr": 2.887078220858896e-06, + "objective/entropy": 132.01498413085938, + "objective/kl": 14.539533615112305, + "objective/non_score_reward": -1.453953504562378, + "objective/rlhf_reward": -5.4158137202262875, + "objective/scores": 0.1, + "policy/approxkl_avg": 65.95307159423828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46912258863449097, + "step": 589, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991271495819092 + }, + { + "episode": 9456, + "epoch": 0.1699680051766905, + "loss/policy_avg": -0.066254623234272, + "lr": 2.8868865030674845e-06, + "objective/entropy": 71.34278869628906, + "objective/kl": 14.032512664794922, + "objective/non_score_reward": -1.4032511711120605, + "objective/rlhf_reward": -7.613004684448242, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.8614501953125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6078362464904785, + "step": 590, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000101089477539 + }, + { + "episode": 9472, + "epoch": 0.17025559909407917, + "loss/policy_avg": 0.5789448618888855, + "lr": 2.8866947852760737e-06, + "objective/entropy": 48.365203857421875, + "objective/kl": 10.217412948608398, + "objective/non_score_reward": -1.0217413902282715, + "objective/rlhf_reward": -3.6869654193520542, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.687175750732422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48520535230636597, + "step": 591, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006895065307617 + }, + { + "episode": 9488, + "epoch": 0.1705431930114678, + "loss/policy_avg": 0.4344612956047058, + "lr": 2.8865030674846625e-06, + "objective/entropy": 252.8432159423828, + "objective/kl": 17.551427841186523, + "objective/non_score_reward": -1.7551428079605103, + "objective/rlhf_reward": -9.020570755004883, + "objective/scores": -0.5, + "policy/approxkl_avg": 154.91946411132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7674171924591064, + "step": 592, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977104663848877 + }, + { + "episode": 9504, + "epoch": 0.17083078692885645, + "loss/policy_avg": 0.45362526178359985, + "lr": 2.8863113496932513e-06, + "objective/entropy": 19.733963012695312, + "objective/kl": 12.605318069458008, + "objective/non_score_reward": -1.2605319023132324, + "objective/rlhf_reward": -4.642127639055252, + "objective/scores": 0.1, + "policy/approxkl_avg": 236.24461364746094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7392984628677368, + "step": 593, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99709153175354 + }, + { + "episode": 9520, + "epoch": 0.1711183808462451, + "loss/policy_avg": 0.05706373229622841, + "lr": 2.8861196319018406e-06, + "objective/entropy": -80.88824462890625, + "objective/kl": 10.57326889038086, + "objective/non_score_reward": -1.0573269128799438, + "objective/rlhf_reward": 0.1706922888755802, + "objective/scores": 1.1, + "policy/approxkl_avg": 220.6209716796875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6385011672973633, + "step": 594, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.000190258026123 + }, + { + "episode": 9536, + "epoch": 0.17140597476363376, + "loss/policy_avg": 0.4798924922943115, + "lr": 2.8859279141104294e-06, + "objective/entropy": 96.04144287109375, + "objective/kl": 13.669404029846191, + "objective/non_score_reward": -1.3669404983520508, + "objective/rlhf_reward": -1.0677618443965908, + "objective/scores": 1.1, + "policy/approxkl_avg": 79.78167724609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7598651647567749, + "step": 595, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998369216918945 + }, + { + "episode": 9552, + "epoch": 0.1716935686810224, + "loss/policy_avg": 0.6355167627334595, + "lr": 2.8857361963190186e-06, + "objective/entropy": 64.89462280273438, + "objective/kl": 13.15363883972168, + "objective/non_score_reward": -1.315363883972168, + "objective/rlhf_reward": -4.861455595493316, + "objective/scores": 0.1, + "policy/approxkl_avg": 57.076480865478516, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47427719831466675, + "step": 596, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995226860046387 + }, + { + "episode": 9568, + "epoch": 0.17198116259841104, + "loss/policy_avg": 0.10829915851354599, + "lr": 2.8855444785276074e-06, + "objective/entropy": 79.2706298828125, + "objective/kl": 16.379791259765625, + "objective/non_score_reward": -1.63797926902771, + "objective/rlhf_reward": -6.151916986703872, + "objective/scores": 0.1, + "policy/approxkl_avg": 103.64852142333984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6097987294197083, + "step": 597, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987919330596924 + }, + { + "episode": 9584, + "epoch": 0.1722687565157997, + "loss/policy_avg": 0.13417291641235352, + "lr": 2.8853527607361962e-06, + "objective/entropy": -54.99534606933594, + "objective/kl": 13.775299072265625, + "objective/non_score_reward": -1.3775299787521362, + "objective/rlhf_reward": -3.110119885206222, + "objective/scores": 0.6, + "policy/approxkl_avg": 41.135406494140625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6685090661048889, + "step": 598, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994128942489624 + }, + { + "episode": 9600, + "epoch": 0.17255635043318834, + "loss/policy_avg": 0.2789982557296753, + "lr": 2.8851610429447855e-06, + "objective/entropy": 6.352031707763672, + "objective/kl": 15.038141250610352, + "objective/non_score_reward": -1.5038139820098877, + "objective/rlhf_reward": -3.61525604724884, + "objective/scores": 0.6, + "policy/approxkl_avg": 80.10298919677734, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47064271569252014, + "step": 599, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999215602874756 + }, + { + "episode": 9616, + "epoch": 0.17284394435057698, + "loss/policy_avg": 0.17868322134017944, + "lr": 2.8849693251533743e-06, + "objective/entropy": 100.41844940185547, + "objective/kl": 14.95844841003418, + "objective/non_score_reward": -1.495845079421997, + "objective/rlhf_reward": -3.8606740257897716, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 51.73733139038086, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8421609997749329, + "step": 600, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001194953918457 + }, + { + "episode": 9632, + "epoch": 0.17313153826796562, + "loss/policy_avg": 0.6214461326599121, + "lr": 2.884777607361963e-06, + "objective/entropy": 231.32444763183594, + "objective/kl": 14.899240493774414, + "objective/non_score_reward": -1.4899241924285889, + "objective/rlhf_reward": -3.035977606416914, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 157.85556030273438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5734395980834961, + "step": 601, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9957280158996582 + }, + { + "episode": 9648, + "epoch": 0.1734191321853543, + "loss/policy_avg": 0.16177789866924286, + "lr": 2.8845858895705523e-06, + "objective/entropy": -75.67822265625, + "objective/kl": 13.02596664428711, + "objective/non_score_reward": -1.3025965690612793, + "objective/rlhf_reward": -0.8103865742683407, + "objective/scores": 1.1, + "policy/approxkl_avg": 51.60423278808594, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7318694591522217, + "step": 602, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996418952941895 + }, + { + "episode": 9664, + "epoch": 0.17370672610274293, + "loss/policy_avg": 0.22588306665420532, + "lr": 2.884394171779141e-06, + "objective/entropy": 132.67103576660156, + "objective/kl": 13.433195114135742, + "objective/non_score_reward": -1.3433196544647217, + "objective/rlhf_reward": -4.973278379440307, + "objective/scores": 0.1, + "policy/approxkl_avg": 99.59487915039062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6894341707229614, + "step": 603, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990489482879639 + }, + { + "episode": 9680, + "epoch": 0.17399432002013157, + "loss/policy_avg": 0.22847160696983337, + "lr": 2.8842024539877304e-06, + "objective/entropy": 122.22633361816406, + "objective/kl": 20.479631423950195, + "objective/non_score_reward": -2.0479629039764404, + "objective/rlhf_reward": -3.7918519139289852, + "objective/scores": 1.1, + "policy/approxkl_avg": 103.07848358154297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8397064805030823, + "step": 604, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971545934677124 + }, + { + "episode": 9696, + "epoch": 0.1742819139375202, + "loss/policy_avg": 0.397056519985199, + "lr": 2.884010736196319e-06, + "objective/entropy": -62.73558044433594, + "objective/kl": 10.932550430297852, + "objective/non_score_reward": -1.0932550430297852, + "objective/rlhf_reward": -3.9730204105377194, + "objective/scores": 0.1, + "policy/approxkl_avg": 28.36017417907715, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5593264102935791, + "step": 605, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984571933746338 + }, + { + "episode": 9712, + "epoch": 0.17456950785490888, + "loss/policy_avg": 0.28502804040908813, + "lr": 2.883819018404908e-06, + "objective/entropy": 72.55328369140625, + "objective/kl": 12.644740104675293, + "objective/non_score_reward": -1.2644741535186768, + "objective/rlhf_reward": -7.057896614074707, + "objective/scores": -0.5, + "policy/approxkl_avg": 103.56513214111328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49720489978790283, + "step": 606, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984281063079834 + }, + { + "episode": 9728, + "epoch": 0.17485710177229752, + "loss/policy_avg": 0.49002373218536377, + "lr": 2.8836273006134972e-06, + "objective/entropy": -66.95445251464844, + "objective/kl": 12.276761054992676, + "objective/non_score_reward": -1.2276760339736938, + "objective/rlhf_reward": -6.910704135894775, + "objective/scores": -0.5, + "policy/approxkl_avg": 128.71206665039062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5521280765533447, + "step": 607, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998512625694275 + }, + { + "episode": 9744, + "epoch": 0.17514469568968616, + "loss/policy_avg": 0.15580351650714874, + "lr": 2.883435582822086e-06, + "objective/entropy": -14.935211181640625, + "objective/kl": 12.201488494873047, + "objective/non_score_reward": -1.2201488018035889, + "objective/rlhf_reward": -4.480595326423645, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.03843688964844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6915831565856934, + "step": 608, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9991041421890259 + }, + { + "episode": 9760, + "epoch": 0.1754322896070748, + "loss/policy_avg": -0.07565896213054657, + "lr": 2.883243865030675e-06, + "objective/entropy": -222.00851440429688, + "objective/kl": 13.098295211791992, + "objective/non_score_reward": -1.3098294734954834, + "objective/rlhf_reward": -3.4144895031777134, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 23.588008880615234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4547193646430969, + "step": 609, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002942085266113 + }, + { + "episode": 9776, + "epoch": 0.17571988352446347, + "loss/policy_avg": -0.02608119696378708, + "lr": 2.883052147239264e-06, + "objective/entropy": -214.93325805664062, + "objective/kl": 13.121513366699219, + "objective/non_score_reward": -1.3121511936187744, + "objective/rlhf_reward": -2.324885469616625, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.12300109863281, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7193573713302612, + "step": 610, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.001455307006836 + }, + { + "episode": 9792, + "epoch": 0.1760074774418521, + "loss/policy_avg": 0.05792073905467987, + "lr": 2.882860429447853e-06, + "objective/entropy": -38.28462600708008, + "objective/kl": 6.414361000061035, + "objective/non_score_reward": -0.6414362192153931, + "objective/rlhf_reward": -4.565744876861572, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.29863929748535, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7934014797210693, + "step": 611, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998834133148193 + }, + { + "episode": 9808, + "epoch": 0.17629507135924075, + "loss/policy_avg": 0.34763196110725403, + "lr": 2.8826687116564417e-06, + "objective/entropy": -187.356201171875, + "objective/kl": 14.694540977478027, + "objective/non_score_reward": -1.4694541692733765, + "objective/rlhf_reward": -7.877816200256348, + "objective/scores": -0.5, + "policy/approxkl_avg": 164.05078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48596644401550293, + "step": 612, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99922776222229 + }, + { + "episode": 9824, + "epoch": 0.17658266527662939, + "loss/policy_avg": 0.034425608813762665, + "lr": 2.8824769938650305e-06, + "objective/entropy": 30.521377563476562, + "objective/kl": 14.546671867370605, + "objective/non_score_reward": -1.4546672105789185, + "objective/rlhf_reward": -5.4186688423156735, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.84369659423828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6131436824798584, + "step": 613, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000127077102661 + }, + { + "episode": 9840, + "epoch": 0.17687025919401805, + "loss/policy_avg": 0.6788812875747681, + "lr": 2.8822852760736197e-06, + "objective/entropy": -29.50526237487793, + "objective/kl": 11.87314224243164, + "objective/non_score_reward": -1.1873142719268799, + "objective/rlhf_reward": -1.8255377455961435, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 69.99320220947266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484935283660889, + "step": 614, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9981306791305542 + }, + { + "episode": 9856, + "epoch": 0.1771578531114067, + "loss/policy_avg": 0.018577001988887787, + "lr": 2.8820935582822085e-06, + "objective/entropy": 91.57209777832031, + "objective/kl": 15.141888618469238, + "objective/non_score_reward": -1.5141887664794922, + "objective/rlhf_reward": -8.056755065917969, + "objective/scores": -0.5, + "policy/approxkl_avg": 128.7979736328125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5703952312469482, + "step": 615, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988903999328613 + }, + { + "episode": 9872, + "epoch": 0.17744544702879533, + "loss/policy_avg": -0.041117969900369644, + "lr": 2.8819018404907974e-06, + "objective/entropy": 137.1066436767578, + "objective/kl": 6.767946243286133, + "objective/non_score_reward": -0.6767945885658264, + "objective/rlhf_reward": -2.30717841386795, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.107406616210938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6602547764778137, + "step": 616, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993090629577637 + }, + { + "episode": 9888, + "epoch": 0.177733040946184, + "loss/policy_avg": 0.3238484859466553, + "lr": 2.8817101226993866e-06, + "objective/entropy": -231.63035583496094, + "objective/kl": 9.579354286193848, + "objective/non_score_reward": -0.9579353928565979, + "objective/rlhf_reward": -5.8317413330078125, + "objective/scores": -0.5, + "policy/approxkl_avg": 95.74900817871094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.772737979888916, + "step": 617, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9988899230957031 + }, + { + "episode": 9904, + "epoch": 0.17802063486357264, + "loss/policy_avg": -0.08733166754245758, + "lr": 2.8815184049079754e-06, + "objective/entropy": 94.04405975341797, + "objective/kl": 10.622529983520508, + "objective/non_score_reward": -1.0622529983520508, + "objective/rlhf_reward": -1.849012157320976, + "objective/scores": 0.6, + "policy/approxkl_avg": 161.30787658691406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6008134484291077, + "step": 618, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983432292938232 + }, + { + "episode": 9920, + "epoch": 0.17830822878096128, + "loss/policy_avg": 0.22342449426651, + "lr": 2.8813266871165646e-06, + "objective/entropy": 23.68462562561035, + "objective/kl": 11.187488555908203, + "objective/non_score_reward": -1.1187489032745361, + "objective/rlhf_reward": -6.4749956130981445, + "objective/scores": -0.5, + "policy/approxkl_avg": 71.44534301757812, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.44539380073547363, + "step": 619, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987552165985107 + }, + { + "episode": 9936, + "epoch": 0.17859582269834992, + "loss/policy_avg": 0.22347092628479004, + "lr": 2.8811349693251534e-06, + "objective/entropy": 175.77520751953125, + "objective/kl": 17.442081451416016, + "objective/non_score_reward": -1.7442083358764648, + "objective/rlhf_reward": -8.97683334350586, + "objective/scores": -0.5, + "policy/approxkl_avg": 161.70484924316406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4676833748817444, + "step": 620, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993557929992676 + }, + { + "episode": 9952, + "epoch": 0.1788834166157386, + "loss/policy_avg": 0.09556691348552704, + "lr": 2.8809432515337422e-06, + "objective/entropy": 166.47305297851562, + "objective/kl": 14.810133934020996, + "objective/non_score_reward": -1.4810134172439575, + "objective/rlhf_reward": -5.524053907394409, + "objective/scores": 0.1, + "policy/approxkl_avg": 101.40066528320312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5872761011123657, + "step": 621, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000371217727661 + }, + { + "episode": 9968, + "epoch": 0.17917101053312723, + "loss/policy_avg": 0.30460673570632935, + "lr": 2.8807515337423315e-06, + "objective/entropy": -225.1236572265625, + "objective/kl": 13.543222427368164, + "objective/non_score_reward": -1.3543224334716797, + "objective/rlhf_reward": -1.0172893762588497, + "objective/scores": 1.1, + "policy/approxkl_avg": 82.92485046386719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5670063495635986, + "step": 622, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980478286743164 + }, + { + "episode": 9984, + "epoch": 0.17945860445051587, + "loss/policy_avg": 0.2849801778793335, + "lr": 2.8805598159509203e-06, + "objective/entropy": 96.21595764160156, + "objective/kl": 12.93740463256836, + "objective/non_score_reward": -1.2937402725219727, + "objective/rlhf_reward": -4.774961328506469, + "objective/scores": 0.1, + "policy/approxkl_avg": 142.59600830078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47951966524124146, + "step": 623, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973121881484985 + }, + { + "episode": 10000, + "epoch": 0.1797461983679045, + "loss/policy_avg": 0.25585901737213135, + "lr": 2.880368098159509e-06, + "objective/entropy": 194.88865661621094, + "objective/kl": 6.153909206390381, + "objective/non_score_reward": -0.6153908967971802, + "objective/rlhf_reward": 0.4621554904270444, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 31.100088119506836, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7605673670768738, + "step": 624, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002366065979004 + }, + { + "episode": 10016, + "epoch": 0.18003379228529318, + "loss/policy_avg": 0.24915774166584015, + "lr": 2.8801763803680983e-06, + "objective/entropy": 120.67022705078125, + "objective/kl": 8.496511459350586, + "objective/non_score_reward": -0.8496510982513428, + "objective/rlhf_reward": -5.398604393005371, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.57280731201172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6938654184341431, + "step": 625, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988131523132324 + }, + { + "episode": 10032, + "epoch": 0.18032138620268182, + "loss/policy_avg": 0.4236345887184143, + "lr": 2.879984662576687e-06, + "objective/entropy": -12.667598724365234, + "objective/kl": 11.722315788269043, + "objective/non_score_reward": -1.1722315549850464, + "objective/rlhf_reward": -4.2889264062047, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.98892211914062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6948565244674683, + "step": 626, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976496696472168 + }, + { + "episode": 10048, + "epoch": 0.18060898012007046, + "loss/policy_avg": 0.16770553588867188, + "lr": 2.8797929447852764e-06, + "objective/entropy": 57.50523376464844, + "objective/kl": 17.840484619140625, + "objective/non_score_reward": -1.7840485572814941, + "objective/rlhf_reward": -9.136194229125977, + "objective/scores": -0.5, + "policy/approxkl_avg": 120.76194763183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6823735237121582, + "step": 627, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997957944869995 + }, + { + "episode": 10064, + "epoch": 0.1808965740374591, + "loss/policy_avg": 0.4961099624633789, + "lr": 2.879601226993865e-06, + "objective/entropy": -25.265766143798828, + "objective/kl": 10.27054214477539, + "objective/non_score_reward": -1.0270541906356812, + "objective/rlhf_reward": -1.1844979419719903, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 57.600494384765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6599663496017456, + "step": 628, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999619722366333 + }, + { + "episode": 10080, + "epoch": 0.18118416795484776, + "loss/policy_avg": 0.562252938747406, + "lr": 2.879409509202454e-06, + "objective/entropy": 54.366180419921875, + "objective/kl": 13.384576797485352, + "objective/non_score_reward": -1.3384575843811035, + "objective/rlhf_reward": -0.9538304716348645, + "objective/scores": 1.1, + "policy/approxkl_avg": 58.41983413696289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6028238534927368, + "step": 629, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995472431182861 + }, + { + "episode": 10096, + "epoch": 0.1814717618722364, + "loss/policy_avg": 0.9494496583938599, + "lr": 2.8792177914110432e-06, + "objective/entropy": 102.92730712890625, + "objective/kl": 11.970917701721191, + "objective/non_score_reward": -1.1970919370651245, + "objective/rlhf_reward": -3.055034414927164, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 71.0523910522461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5434826612472534, + "step": 630, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000587224960327 + }, + { + "episode": 10112, + "epoch": 0.18175935578962504, + "loss/policy_avg": 0.06726402044296265, + "lr": 2.879026073619632e-06, + "objective/entropy": -49.708221435546875, + "objective/kl": 11.118875503540039, + "objective/non_score_reward": -1.1118874549865723, + "objective/rlhf_reward": -1.5238312228929727, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 84.36143493652344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7672351002693176, + "step": 631, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989380836486816 + }, + { + "episode": 10128, + "epoch": 0.18204694970701368, + "loss/policy_avg": -0.2285957634449005, + "lr": 2.8788343558282213e-06, + "objective/entropy": 167.11105346679688, + "objective/kl": 11.83863353729248, + "objective/non_score_reward": -1.1838631629943848, + "objective/rlhf_reward": -3.1791934733658582, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 22.718273162841797, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.45381465554237366, + "step": 632, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0061585903167725 + }, + { + "episode": 10144, + "epoch": 0.18233454362440235, + "loss/policy_avg": 1.1582974195480347, + "lr": 2.87864263803681e-06, + "objective/entropy": -151.89218139648438, + "objective/kl": 7.8154096603393555, + "objective/non_score_reward": -0.7815409898757935, + "objective/rlhf_reward": -2.7261639595031735, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.87982177734375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6609333753585815, + "step": 633, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000443458557129 + }, + { + "episode": 10160, + "epoch": 0.182622137541791, + "loss/policy_avg": -0.023708324879407883, + "lr": 2.878450920245399e-06, + "objective/entropy": 37.85424041748047, + "objective/kl": 12.195389747619629, + "objective/non_score_reward": -1.2195390462875366, + "objective/rlhf_reward": -1.9544372900736064, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 84.37319946289062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.563186764717102, + "step": 634, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0016093254089355 + }, + { + "episode": 10176, + "epoch": 0.18290973145917963, + "loss/policy_avg": 0.22720938920974731, + "lr": 2.8782592024539877e-06, + "objective/entropy": 70.03634643554688, + "objective/kl": 14.060079574584961, + "objective/non_score_reward": -1.406008005142212, + "objective/rlhf_reward": -7.624032020568848, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.89201354980469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.449720174074173, + "step": 635, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980741739273071 + }, + { + "episode": 10192, + "epoch": 0.1831973253765683, + "loss/policy_avg": 0.5389465689659119, + "lr": 2.8780674846625765e-06, + "objective/entropy": -199.75421142578125, + "objective/kl": 15.233451843261719, + "objective/non_score_reward": -1.5233452320098877, + "objective/rlhf_reward": -8.093381881713867, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.79878234863281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.48563235998153687, + "step": 636, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998356819152832 + }, + { + "episode": 10208, + "epoch": 0.18348491929395694, + "loss/policy_avg": 0.37082988023757935, + "lr": 2.8778757668711657e-06, + "objective/entropy": 29.62700653076172, + "objective/kl": 10.318947792053223, + "objective/non_score_reward": -1.0318948030471802, + "objective/rlhf_reward": -1.2038601979028907, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 19.83915138244629, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5535821914672852, + "step": 637, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009536743164062 + }, + { + "episode": 10224, + "epoch": 0.18377251321134558, + "loss/policy_avg": 0.01469859853386879, + "lr": 2.8776840490797546e-06, + "objective/entropy": 13.477630615234375, + "objective/kl": 14.343404769897461, + "objective/non_score_reward": -1.4343405961990356, + "objective/rlhf_reward": -5.337362265586853, + "objective/scores": 0.1, + "policy/approxkl_avg": 108.1371078491211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4337630867958069, + "step": 638, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978924989700317 + }, + { + "episode": 10240, + "epoch": 0.18406010712873422, + "loss/policy_avg": 0.3221081495285034, + "lr": 2.8774923312883434e-06, + "objective/entropy": 282.66595458984375, + "objective/kl": 11.63412857055664, + "objective/non_score_reward": -1.1634126901626587, + "objective/rlhf_reward": -0.2536508500576016, + "objective/scores": 1.1, + "policy/approxkl_avg": 111.42243957519531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7095741033554077, + "step": 639, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970577955245972 + }, + { + "episode": 10256, + "epoch": 0.18434770104612289, + "loss/policy_avg": 0.08750347793102264, + "lr": 2.8773006134969326e-06, + "objective/entropy": -36.40140914916992, + "objective/kl": 12.267041206359863, + "objective/non_score_reward": -1.2267042398452759, + "objective/rlhf_reward": -6.9068169593811035, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.82714080810547, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5733236074447632, + "step": 640, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000546932220459 + }, + { + "episode": 10272, + "epoch": 0.18463529496351153, + "loss/policy_avg": 0.3036690652370453, + "lr": 2.8771088957055214e-06, + "objective/entropy": 49.865753173828125, + "objective/kl": 11.792703628540039, + "objective/non_score_reward": -1.1792702674865723, + "objective/rlhf_reward": -4.317080801725387, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.753089904785156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.695482611656189, + "step": 641, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989582300186157 + }, + { + "episode": 10288, + "epoch": 0.18492288888090017, + "loss/policy_avg": 0.7012618780136108, + "lr": 2.8769171779141106e-06, + "objective/entropy": 92.8203125, + "objective/kl": 10.009756088256836, + "objective/non_score_reward": -1.0009756088256836, + "objective/rlhf_reward": -6.003902435302734, + "objective/scores": -0.5, + "policy/approxkl_avg": 160.71142578125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.45697641372680664, + "step": 642, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986211061477661 + }, + { + "episode": 10304, + "epoch": 0.1852104827982888, + "loss/policy_avg": 0.18139493465423584, + "lr": 2.8767254601226994e-06, + "objective/entropy": 35.44036865234375, + "objective/kl": 11.35489273071289, + "objective/non_score_reward": -1.1354892253875732, + "objective/rlhf_reward": -6.541956901550293, + "objective/scores": -0.5, + "policy/approxkl_avg": 73.70870971679688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6653156876564026, + "step": 643, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983882904052734 + }, + { + "episode": 10320, + "epoch": 0.18549807671567747, + "loss/policy_avg": 0.21259146928787231, + "lr": 2.8765337423312883e-06, + "objective/entropy": 65.79669189453125, + "objective/kl": 5.1166887283325195, + "objective/non_score_reward": -0.5116689205169678, + "objective/rlhf_reward": -1.6466757565736772, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.101611137390137, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44910210371017456, + "step": 644, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9965652227401733 + }, + { + "episode": 10336, + "epoch": 0.1857856706330661, + "loss/policy_avg": 1.021752953529358, + "lr": 2.8763420245398775e-06, + "objective/entropy": -34.64234161376953, + "objective/kl": 15.709405899047852, + "objective/non_score_reward": -1.5709404945373535, + "objective/rlhf_reward": -5.883762127161026, + "objective/scores": 0.1, + "policy/approxkl_avg": 142.4956817626953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5723711252212524, + "step": 645, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976788759231567 + }, + { + "episode": 10352, + "epoch": 0.18607326455045475, + "loss/policy_avg": 0.24777646362781525, + "lr": 2.8761503067484663e-06, + "objective/entropy": 77.03242492675781, + "objective/kl": 15.705501556396484, + "objective/non_score_reward": -1.5705503225326538, + "objective/rlhf_reward": -5.882201319932937, + "objective/scores": 0.1, + "policy/approxkl_avg": 104.79753875732422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619193434715271, + "step": 646, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973654747009277 + }, + { + "episode": 10368, + "epoch": 0.1863608584678434, + "loss/policy_avg": 0.05733855068683624, + "lr": 2.8759585889570555e-06, + "objective/entropy": 105.6750717163086, + "objective/kl": 13.765717506408691, + "objective/non_score_reward": -1.3765718936920166, + "objective/rlhf_reward": -5.1062874555587765, + "objective/scores": 0.1, + "policy/approxkl_avg": 96.42398834228516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.2780163288116455, + "step": 647, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983540773391724 + }, + { + "episode": 10384, + "epoch": 0.18664845238523206, + "loss/policy_avg": 0.4622950553894043, + "lr": 2.8757668711656443e-06, + "objective/entropy": -98.47879028320312, + "objective/kl": 7.556210517883301, + "objective/non_score_reward": -0.755621075630188, + "objective/rlhf_reward": -0.09876528823492192, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 68.89460754394531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7213411927223206, + "step": 648, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9966726303100586 + }, + { + "episode": 10400, + "epoch": 0.1869360463026207, + "loss/policy_avg": 0.8236960172653198, + "lr": 2.875575153374233e-06, + "objective/entropy": 18.49713897705078, + "objective/kl": 15.854425430297852, + "objective/non_score_reward": -1.5854425430297852, + "objective/rlhf_reward": -8.34177017211914, + "objective/scores": -0.5, + "policy/approxkl_avg": 67.23236083984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8861958980560303, + "step": 649, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006723403930664 + }, + { + "episode": 10416, + "epoch": 0.18722364022000934, + "loss/policy_avg": -0.11324408650398254, + "lr": 2.8753834355828224e-06, + "objective/entropy": -102.02124786376953, + "objective/kl": 5.1298675537109375, + "objective/non_score_reward": -0.5129867792129517, + "objective/rlhf_reward": 0.8717719123351846, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 28.45832633972168, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4376002848148346, + "step": 650, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0088820457458496 + }, + { + "episode": 10432, + "epoch": 0.18751123413739798, + "loss/policy_avg": 0.2450694590806961, + "lr": 2.875191717791411e-06, + "objective/entropy": 33.388450622558594, + "objective/kl": 13.52882194519043, + "objective/non_score_reward": -1.3528821468353271, + "objective/rlhf_reward": -2.4878097816717357, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 11.315901756286621, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.41914981603622437, + "step": 651, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0015416145324707 + }, + { + "episode": 10448, + "epoch": 0.18779882805478665, + "loss/policy_avg": 0.009298861026763916, + "lr": 2.875e-06, + "objective/entropy": -16.950359344482422, + "objective/kl": 9.792383193969727, + "objective/non_score_reward": -0.9792382717132568, + "objective/rlhf_reward": 0.4830468982458118, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.477102279663086, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6265867948532104, + "step": 652, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000293016433716 + }, + { + "episode": 10464, + "epoch": 0.1880864219721753, + "loss/policy_avg": 0.4039332866668701, + "lr": 2.8748082822085892e-06, + "objective/entropy": 66.02500915527344, + "objective/kl": 8.322999954223633, + "objective/non_score_reward": -0.8322999477386475, + "objective/rlhf_reward": -5.32919979095459, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.993392944335938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6458040475845337, + "step": 653, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998199939727783 + }, + { + "episode": 10480, + "epoch": 0.18837401588956393, + "loss/policy_avg": 0.15707165002822876, + "lr": 2.874616564417178e-06, + "objective/entropy": 171.90771484375, + "objective/kl": 11.948579788208008, + "objective/non_score_reward": -1.1948580741882324, + "objective/rlhf_reward": -6.77943229675293, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.851991653442383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6504145860671997, + "step": 654, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997503757476807 + }, + { + "episode": 10496, + "epoch": 0.1886616098069526, + "loss/policy_avg": 0.07920925319194794, + "lr": 2.8744248466257673e-06, + "objective/entropy": 61.45829391479492, + "objective/kl": 12.758872985839844, + "objective/non_score_reward": -1.275887370109558, + "objective/rlhf_reward": -3.1561381918954208, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 101.437744140625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8374680876731873, + "step": 655, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0019378662109375 + }, + { + "episode": 10512, + "epoch": 0.18894920372434124, + "loss/policy_avg": 0.555253267288208, + "lr": 2.8742331288343557e-06, + "objective/entropy": -175.50808715820312, + "objective/kl": 14.069064140319824, + "objective/non_score_reward": -1.4069066047668457, + "objective/rlhf_reward": -5.227626121044159, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.80171203613281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.590570330619812, + "step": 656, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998897910118103 + }, + { + "episode": 10528, + "epoch": 0.18923679764172988, + "loss/policy_avg": 0.11733473837375641, + "lr": 2.874041411042945e-06, + "objective/entropy": 114.00068664550781, + "objective/kl": 18.550704956054688, + "objective/non_score_reward": -1.8550705909729004, + "objective/rlhf_reward": -7.020282661914825, + "objective/scores": 0.1, + "policy/approxkl_avg": 203.83114624023438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6477522850036621, + "step": 657, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994854927062988 + }, + { + "episode": 10544, + "epoch": 0.18952439155911852, + "loss/policy_avg": 0.15716442465782166, + "lr": 2.8738496932515337e-06, + "objective/entropy": 280.2091064453125, + "objective/kl": 15.730361938476562, + "objective/non_score_reward": -1.5730363130569458, + "objective/rlhf_reward": -8.292144775390625, + "objective/scores": -0.5, + "policy/approxkl_avg": 69.69169616699219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 1.0129367113113403, + "step": 658, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002072334289551 + }, + { + "episode": 10560, + "epoch": 0.18981198547650718, + "loss/policy_avg": 0.6117931604385376, + "lr": 2.8736579754601225e-06, + "objective/entropy": -17.964401245117188, + "objective/kl": 12.338603973388672, + "objective/non_score_reward": -1.2338604927062988, + "objective/rlhf_reward": -6.935441970825195, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.43081283569336, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6273235082626343, + "step": 659, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0019495487213135 + }, + { + "episode": 10576, + "epoch": 0.19009957939389582, + "loss/policy_avg": 0.1620130091905594, + "lr": 2.8734662576687117e-06, + "objective/entropy": -31.008544921875, + "objective/kl": 8.896329879760742, + "objective/non_score_reward": -0.8896329998970032, + "objective/rlhf_reward": -1.6111206514405567, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.579498291015625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8015791177749634, + "step": 660, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0010838508605957 + }, + { + "episode": 10592, + "epoch": 0.19038717331128446, + "loss/policy_avg": 0.30528926849365234, + "lr": 2.8732745398773006e-06, + "objective/entropy": -81.17588806152344, + "objective/kl": 12.422649383544922, + "objective/non_score_reward": -1.2422648668289185, + "objective/rlhf_reward": -2.569059437513351, + "objective/scores": 0.6, + "policy/approxkl_avg": 26.168109893798828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6268837451934814, + "step": 661, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000943899154663 + }, + { + "episode": 10608, + "epoch": 0.1906747672286731, + "loss/policy_avg": 0.21409085392951965, + "lr": 2.8730828220858894e-06, + "objective/entropy": 109.389404296875, + "objective/kl": 14.156780242919922, + "objective/non_score_reward": -1.415677785873413, + "objective/rlhf_reward": -1.2627113521099087, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.9066390991211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6278871297836304, + "step": 662, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998223781585693 + }, + { + "episode": 10624, + "epoch": 0.19096236114606177, + "loss/policy_avg": 0.4462759494781494, + "lr": 2.8728911042944786e-06, + "objective/entropy": 9.342247009277344, + "objective/kl": 15.820850372314453, + "objective/non_score_reward": -1.5820850133895874, + "objective/rlhf_reward": -4.205633642450843, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 166.1714630126953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4893154501914978, + "step": 663, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985361099243164 + }, + { + "episode": 10640, + "epoch": 0.1912499550634504, + "loss/policy_avg": 0.26638445258140564, + "lr": 2.8726993865030674e-06, + "objective/entropy": 177.13156127929688, + "objective/kl": 9.343158721923828, + "objective/non_score_reward": -0.9343159794807434, + "objective/rlhf_reward": -5.7372636795043945, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.281099319458008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6856767535209656, + "step": 664, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989508390426636 + }, + { + "episode": 10656, + "epoch": 0.19153754898083905, + "loss/policy_avg": 0.16023144125938416, + "lr": 2.8725076687116566e-06, + "objective/entropy": 78.05880737304688, + "objective/kl": 7.6621246337890625, + "objective/non_score_reward": -0.7662124633789062, + "objective/rlhf_reward": -2.664849868416786, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.030677795410156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3968629837036133, + "step": 665, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0032896995544434 + }, + { + "episode": 10672, + "epoch": 0.1918251428982277, + "loss/policy_avg": 0.06800419837236404, + "lr": 2.8723159509202455e-06, + "objective/entropy": 59.63671875, + "objective/kl": 12.568862915039062, + "objective/non_score_reward": -1.25688636302948, + "objective/rlhf_reward": -7.02754545211792, + "objective/scores": -0.5, + "policy/approxkl_avg": 222.47463989257812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5737947821617126, + "step": 666, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001986026763916 + }, + { + "episode": 10688, + "epoch": 0.19211273681561636, + "loss/policy_avg": 0.553016185760498, + "lr": 2.8721242331288343e-06, + "objective/entropy": 235.48336791992188, + "objective/kl": 10.159873962402344, + "objective/non_score_reward": -1.0159873962402344, + "objective/rlhf_reward": -2.3306164304415384, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 54.860572814941406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7995113134384155, + "step": 667, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0003960132598877 + }, + { + "episode": 10704, + "epoch": 0.192400330733005, + "loss/policy_avg": -0.27713173627853394, + "lr": 2.8719325153374235e-06, + "objective/entropy": 32.731258392333984, + "objective/kl": 14.078591346740723, + "objective/non_score_reward": -1.407859206199646, + "objective/rlhf_reward": -5.231436854600906, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.393226623535156, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6517434120178223, + "step": 668, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0037641525268555 + }, + { + "episode": 10720, + "epoch": 0.19268792465039364, + "loss/policy_avg": 0.40979307889938354, + "lr": 2.8717407975460123e-06, + "objective/entropy": -46.12900924682617, + "objective/kl": 10.303600311279297, + "objective/non_score_reward": -1.0303599834442139, + "objective/rlhf_reward": -3.7214399039745327, + "objective/scores": 0.1, + "policy/approxkl_avg": 37.29252624511719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6077979803085327, + "step": 669, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998400092124939 + }, + { + "episode": 10736, + "epoch": 0.19297551856778228, + "loss/policy_avg": 0.25220245122909546, + "lr": 2.8715490797546015e-06, + "objective/entropy": 105.48408508300781, + "objective/kl": 17.024581909179688, + "objective/non_score_reward": -1.702458143234253, + "objective/rlhf_reward": -4.409832602739334, + "objective/scores": 0.6, + "policy/approxkl_avg": 208.58787536621094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7229609489440918, + "step": 670, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999326467514038 + }, + { + "episode": 10752, + "epoch": 0.19326311248517095, + "loss/policy_avg": -0.009634248912334442, + "lr": 2.8713573619631903e-06, + "objective/entropy": -176.94802856445312, + "objective/kl": 12.598766326904297, + "objective/non_score_reward": -1.2598767280578613, + "objective/rlhf_reward": -4.639506882429123, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.82705307006836, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.590308666229248, + "step": 671, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989120960235596 + }, + { + "episode": 10768, + "epoch": 0.19355070640255959, + "loss/policy_avg": 0.18840017914772034, + "lr": 2.871165644171779e-06, + "objective/entropy": 130.62278747558594, + "objective/kl": 11.168953895568848, + "objective/non_score_reward": -1.1168954372406006, + "objective/rlhf_reward": -6.467581748962402, + "objective/scores": -0.5, + "policy/approxkl_avg": 15.816328048706055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6742215752601624, + "step": 672, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0026257038116455 + }, + { + "episode": 10784, + "epoch": 0.19383830031994823, + "loss/policy_avg": 0.23392519354820251, + "lr": 2.8709739263803684e-06, + "objective/entropy": 127.62983703613281, + "objective/kl": 7.803813934326172, + "objective/non_score_reward": -0.780381441116333, + "objective/rlhf_reward": -1.1741145131754236, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 32.454715728759766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6811597347259521, + "step": 673, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976636171340942 + }, + { + "episode": 10800, + "epoch": 0.1941258942373369, + "loss/policy_avg": -0.1319524347782135, + "lr": 2.870782208588957e-06, + "objective/entropy": -20.453079223632812, + "objective/kl": 13.015239715576172, + "objective/non_score_reward": -1.301524043083191, + "objective/rlhf_reward": -7.206096172332764, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.110284805297852, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7017459869384766, + "step": 674, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0026445388793945 + }, + { + "episode": 10816, + "epoch": 0.19441348815472553, + "loss/policy_avg": 0.04714903235435486, + "lr": 2.870590490797546e-06, + "objective/entropy": 303.12890625, + "objective/kl": 7.725805282592773, + "objective/non_score_reward": -0.7725805044174194, + "objective/rlhf_reward": -5.090322017669678, + "objective/scores": -0.5, + "policy/approxkl_avg": 25.820777893066406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7819321155548096, + "step": 675, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988234043121338 + }, + { + "episode": 10832, + "epoch": 0.19470108207211417, + "loss/policy_avg": 0.23574650287628174, + "lr": 2.8703987730061352e-06, + "objective/entropy": -194.20651245117188, + "objective/kl": 14.129228591918945, + "objective/non_score_reward": -1.412922978401184, + "objective/rlhf_reward": -1.2516918838024136, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.932640075683594, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.635951817035675, + "step": 676, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9998043775558472 + }, + { + "episode": 10848, + "epoch": 0.1949886759895028, + "loss/policy_avg": 1.922278881072998, + "lr": 2.870207055214724e-06, + "objective/entropy": 202.7659454345703, + "objective/kl": 8.443321228027344, + "objective/non_score_reward": -0.8443321585655212, + "objective/rlhf_reward": -5.377328872680664, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.10383605957031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7711433172225952, + "step": 677, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003726482391357 + }, + { + "episode": 10864, + "epoch": 0.19527626990689148, + "loss/policy_avg": 0.08129671216011047, + "lr": 2.870015337423313e-06, + "objective/entropy": -35.23242950439453, + "objective/kl": 12.633974075317383, + "objective/non_score_reward": -1.263397455215454, + "objective/rlhf_reward": -7.053589820861816, + "objective/scores": -0.5, + "policy/approxkl_avg": 148.25845336914062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8183538913726807, + "step": 678, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973478317260742 + }, + { + "episode": 10880, + "epoch": 0.19556386382428012, + "loss/policy_avg": 0.07529361546039581, + "lr": 2.8698236196319017e-06, + "objective/entropy": -160.49822998046875, + "objective/kl": 14.277109146118164, + "objective/non_score_reward": -1.4277108907699585, + "objective/rlhf_reward": -7.710843086242676, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.69904327392578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6668201684951782, + "step": 679, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9965490102767944 + }, + { + "episode": 10896, + "epoch": 0.19585145774166876, + "loss/policy_avg": -0.24820567667484283, + "lr": 2.869631901840491e-06, + "objective/entropy": -144.34962463378906, + "objective/kl": 7.242027759552002, + "objective/non_score_reward": -0.7242028117179871, + "objective/rlhf_reward": -2.4968111276626583, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.443586349487305, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.5139869451522827, + "step": 680, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0061938762664795 + }, + { + "episode": 10912, + "epoch": 0.1961390516590574, + "loss/policy_avg": -0.08409806340932846, + "lr": 2.8694401840490797e-06, + "objective/entropy": -54.80754852294922, + "objective/kl": 12.144088745117188, + "objective/non_score_reward": -1.2144087553024292, + "objective/rlhf_reward": -1.9339162155401437, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.676344156265259, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8483699560165405, + "step": 681, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0018930435180664 + }, + { + "episode": 10928, + "epoch": 0.19642664557644607, + "loss/policy_avg": 0.12138275802135468, + "lr": 2.8692484662576685e-06, + "objective/entropy": 70.32062530517578, + "objective/kl": 12.482439041137695, + "objective/non_score_reward": -1.248243808746338, + "objective/rlhf_reward": -4.592975145578384, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.664492130279541, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5509634017944336, + "step": 682, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0010645389556885 + }, + { + "episode": 10944, + "epoch": 0.1967142394938347, + "loss/policy_avg": 0.2954164147377014, + "lr": 2.8690567484662578e-06, + "objective/entropy": 46.32169723510742, + "objective/kl": 7.934720039367676, + "objective/non_score_reward": -0.7934720516204834, + "objective/rlhf_reward": 1.2261117786169056, + "objective/scores": 1.1, + "policy/approxkl_avg": 69.41313934326172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5250649452209473, + "step": 683, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001229763031006 + }, + { + "episode": 10960, + "epoch": 0.19700183341122335, + "loss/policy_avg": 0.33847346901893616, + "lr": 2.8688650306748466e-06, + "objective/entropy": 136.00357055664062, + "objective/kl": 11.347841262817383, + "objective/non_score_reward": -1.1347841024398804, + "objective/rlhf_reward": -0.13913632035255397, + "objective/scores": 1.1, + "policy/approxkl_avg": 39.6363525390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5623573064804077, + "step": 684, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99955153465271 + }, + { + "episode": 10976, + "epoch": 0.197289427328612, + "loss/policy_avg": 0.0871865451335907, + "lr": 2.868673312883436e-06, + "objective/entropy": -171.5851593017578, + "objective/kl": 11.16063404083252, + "objective/non_score_reward": -1.1160634756088257, + "objective/rlhf_reward": -4.064253827929496, + "objective/scores": 0.1, + "policy/approxkl_avg": 69.5920181274414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.790228009223938, + "step": 685, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9968866109848022 + }, + { + "episode": 10992, + "epoch": 0.19757702124600066, + "loss/policy_avg": 0.17770184576511383, + "lr": 2.8684815950920246e-06, + "objective/entropy": 117.78619384765625, + "objective/kl": 14.742965698242188, + "objective/non_score_reward": -1.4742965698242188, + "objective/rlhf_reward": -5.497186398506164, + "objective/scores": 0.1, + "policy/approxkl_avg": 171.13133239746094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5882231593132019, + "step": 686, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990973472595215 + }, + { + "episode": 11008, + "epoch": 0.1978646151633893, + "loss/policy_avg": 0.4302918314933777, + "lr": 2.8682898773006134e-06, + "objective/entropy": 86.18273162841797, + "objective/kl": 11.427553176879883, + "objective/non_score_reward": -1.142755389213562, + "objective/rlhf_reward": -6.571021556854248, + "objective/scores": -0.5, + "policy/approxkl_avg": 177.87432861328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.468346506357193, + "step": 687, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976341724395752 + }, + { + "episode": 11024, + "epoch": 0.19815220908077794, + "loss/policy_avg": 0.49217331409454346, + "lr": 2.8680981595092026e-06, + "objective/entropy": -60.496864318847656, + "objective/kl": 19.168210983276367, + "objective/non_score_reward": -1.9168212413787842, + "objective/rlhf_reward": -5.54457879282621, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 153.42633056640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6640913486480713, + "step": 688, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000704765319824 + }, + { + "episode": 11040, + "epoch": 0.19843980299816658, + "loss/policy_avg": 0.5149247646331787, + "lr": 2.8679064417177915e-06, + "objective/entropy": 217.9249725341797, + "objective/kl": 10.10614013671875, + "objective/non_score_reward": -1.010614037513733, + "objective/rlhf_reward": 0.3575438499450687, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.427753448486328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5551241636276245, + "step": 689, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004658699035645 + }, + { + "episode": 11056, + "epoch": 0.19872739691555524, + "loss/policy_avg": 2.387340784072876, + "lr": 2.8677147239263803e-06, + "objective/entropy": 90.62098693847656, + "objective/kl": 13.577856063842773, + "objective/non_score_reward": -1.357785701751709, + "objective/rlhf_reward": -5.031142449378967, + "objective/scores": 0.1, + "policy/approxkl_avg": 76.07826232910156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5787378549575806, + "step": 690, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003950595855713 + }, + { + "episode": 11072, + "epoch": 0.19901499083294388, + "loss/policy_avg": 0.07591082900762558, + "lr": 2.8675230061349695e-06, + "objective/entropy": 192.91531372070312, + "objective/kl": 18.27234649658203, + "objective/non_score_reward": -1.8272345066070557, + "objective/rlhf_reward": -5.186231674925361, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 53.839683532714844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5381182432174683, + "step": 691, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999629259109497 + }, + { + "episode": 11088, + "epoch": 0.19930258475033252, + "loss/policy_avg": 0.24866509437561035, + "lr": 2.8673312883435583e-06, + "objective/entropy": 162.3503875732422, + "objective/kl": 18.564533233642578, + "objective/non_score_reward": -1.8564532995224, + "objective/rlhf_reward": -3.025813496112823, + "objective/scores": 1.1, + "policy/approxkl_avg": 120.78541564941406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5609779357910156, + "step": 692, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961520433425903 + }, + { + "episode": 11104, + "epoch": 0.1995901786677212, + "loss/policy_avg": -0.0011049304157495499, + "lr": 2.8671395705521475e-06, + "objective/entropy": 96.88809967041016, + "objective/kl": 7.879084587097168, + "objective/non_score_reward": -0.7879084944725037, + "objective/rlhf_reward": -2.7516339480876923, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.773414611816406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.45891791582107544, + "step": 693, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016512870788574 + }, + { + "episode": 11120, + "epoch": 0.19987777258510983, + "loss/policy_avg": 0.10084787011146545, + "lr": 2.8669478527607364e-06, + "objective/entropy": -41.20310974121094, + "objective/kl": 13.97619915008545, + "objective/non_score_reward": -1.3976198434829712, + "objective/rlhf_reward": -1.190479493141174, + "objective/scores": 1.1, + "policy/approxkl_avg": 52.299827575683594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5506526231765747, + "step": 694, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993311166763306 + }, + { + "episode": 11136, + "epoch": 0.20016536650249847, + "loss/policy_avg": 0.06375744938850403, + "lr": 2.866756134969325e-06, + "objective/entropy": -108.92198181152344, + "objective/kl": 9.946971893310547, + "objective/non_score_reward": -0.9946972131729126, + "objective/rlhf_reward": -5.97878885269165, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.416961193084717, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5260124802589417, + "step": 695, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992403984069824 + }, + { + "episode": 11152, + "epoch": 0.2004529604198871, + "loss/policy_avg": 0.08030396699905396, + "lr": 2.8665644171779144e-06, + "objective/entropy": 123.5355453491211, + "objective/kl": 15.836409568786621, + "objective/non_score_reward": -1.5836410522460938, + "objective/rlhf_reward": -8.334564208984375, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.76978302001953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6828103065490723, + "step": 696, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998550415039062 + }, + { + "episode": 11168, + "epoch": 0.20074055433727578, + "loss/policy_avg": 0.29151037335395813, + "lr": 2.866372699386503e-06, + "objective/entropy": -57.21668243408203, + "objective/kl": 15.339473724365234, + "objective/non_score_reward": -1.5339473485946655, + "objective/rlhf_reward": -1.7357893645763394, + "objective/scores": 1.1, + "policy/approxkl_avg": 91.99110412597656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5846511721611023, + "step": 697, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978073835372925 + }, + { + "episode": 11184, + "epoch": 0.20102814825466442, + "loss/policy_avg": 0.11156813055276871, + "lr": 2.8661809815950924e-06, + "objective/entropy": 4.086456298828125, + "objective/kl": 12.176067352294922, + "objective/non_score_reward": -1.2176066637039185, + "objective/rlhf_reward": -4.470426669716835, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.13668441772461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7282131910324097, + "step": 698, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0009164810180664 + }, + { + "episode": 11200, + "epoch": 0.20131574217205306, + "loss/policy_avg": 0.30346500873565674, + "lr": 2.8659892638036812e-06, + "objective/entropy": -137.93194580078125, + "objective/kl": 9.752511978149414, + "objective/non_score_reward": -0.975251317024231, + "objective/rlhf_reward": -3.501005089282989, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.240784645080566, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4937957525253296, + "step": 699, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995919466018677 + }, + { + "episode": 11216, + "epoch": 0.2016033360894417, + "loss/policy_avg": 0.2282610535621643, + "lr": 2.86579754601227e-06, + "objective/entropy": 57.311073303222656, + "objective/kl": 14.862542152404785, + "objective/non_score_reward": -1.486254334449768, + "objective/rlhf_reward": -4.120188410553049, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 18.98688507080078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6022450923919678, + "step": 700, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996930360794067 + }, + { + "episode": 11232, + "epoch": 0.20189093000683037, + "loss/policy_avg": 0.3324587941169739, + "lr": 2.865605828220859e-06, + "objective/entropy": 49.462989807128906, + "objective/kl": 7.709723472595215, + "objective/non_score_reward": -0.7709723711013794, + "objective/rlhf_reward": -5.083889484405518, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.05154800415039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41537797451019287, + "step": 701, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002284049987793 + }, + { + "episode": 11248, + "epoch": 0.202178523924219, + "loss/policy_avg": 0.16259673237800598, + "lr": 2.8654141104294477e-06, + "objective/entropy": 60.31419372558594, + "objective/kl": 13.977958679199219, + "objective/non_score_reward": -1.3977960348129272, + "objective/rlhf_reward": -5.191184258460998, + "objective/scores": 0.1, + "policy/approxkl_avg": 202.6460418701172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7732006311416626, + "step": 702, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978877305984497 + }, + { + "episode": 11264, + "epoch": 0.20246611784160765, + "loss/policy_avg": 0.2512626051902771, + "lr": 2.865222392638037e-06, + "objective/entropy": 156.12353515625, + "objective/kl": 16.06157112121582, + "objective/non_score_reward": -1.6061570644378662, + "objective/rlhf_reward": -3.50090917641041, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 191.80953979492188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6488738059997559, + "step": 703, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996211528778076 + }, + { + "episode": 11280, + "epoch": 0.20275371175899629, + "loss/policy_avg": 0.15953761339187622, + "lr": 2.8650306748466257e-06, + "objective/entropy": -43.2182731628418, + "objective/kl": 16.940393447875977, + "objective/non_score_reward": -1.6940394639968872, + "objective/rlhf_reward": -2.3761578559875485, + "objective/scores": 1.1, + "policy/approxkl_avg": 112.27688598632812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.45469629764556885, + "step": 704, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988117218017578 + }, + { + "episode": 11296, + "epoch": 0.20304130567638495, + "loss/policy_avg": 0.6711443066596985, + "lr": 2.8648389570552145e-06, + "objective/entropy": -33.72349548339844, + "objective/kl": 11.539321899414062, + "objective/non_score_reward": -1.1539320945739746, + "objective/rlhf_reward": -2.8823953429857885, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 61.21073913574219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5703375339508057, + "step": 705, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978996515274048 + }, + { + "episode": 11312, + "epoch": 0.2033288995937736, + "loss/policy_avg": 0.7480028867721558, + "lr": 2.8646472392638038e-06, + "objective/entropy": 220.5673828125, + "objective/kl": 7.203307151794434, + "objective/non_score_reward": -0.7203306555747986, + "objective/rlhf_reward": -1.2772026694455916, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 139.5849609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.786684513092041, + "step": 706, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992549419403076 + }, + { + "episode": 11328, + "epoch": 0.20361649351116223, + "loss/policy_avg": 0.00046522170305252075, + "lr": 2.8644555214723926e-06, + "objective/entropy": 77.21452331542969, + "objective/kl": 15.027108192443848, + "objective/non_score_reward": -1.5027105808258057, + "objective/rlhf_reward": -8.010842323303223, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.50825500488281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7454473972320557, + "step": 707, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993491172790527 + }, + { + "episode": 11344, + "epoch": 0.20390408742855087, + "loss/policy_avg": 0.4308600425720215, + "lr": 2.864263803680982e-06, + "objective/entropy": 105.22822570800781, + "objective/kl": 10.61517333984375, + "objective/non_score_reward": -1.0615174770355225, + "objective/rlhf_reward": -1.3223507746469703, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 15.070667266845703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5496830940246582, + "step": 708, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998878836631775 + }, + { + "episode": 11360, + "epoch": 0.20419168134593954, + "loss/policy_avg": 0.02560766041278839, + "lr": 2.8640720858895706e-06, + "objective/entropy": 52.49213790893555, + "objective/kl": 12.071894645690918, + "objective/non_score_reward": -1.2071894407272339, + "objective/rlhf_reward": -1.9050386294138162, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 22.58022689819336, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.678420901298523, + "step": 709, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016746520996094 + }, + { + "episode": 11376, + "epoch": 0.20447927526332818, + "loss/policy_avg": 0.14677026867866516, + "lr": 2.8638803680981594e-06, + "objective/entropy": -44.507568359375, + "objective/kl": 10.297760009765625, + "objective/non_score_reward": -1.0297759771347046, + "objective/rlhf_reward": -6.119103908538818, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.161418914794922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.69853675365448, + "step": 710, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988245964050293 + }, + { + "episode": 11392, + "epoch": 0.20476686918071682, + "loss/policy_avg": 0.3421841859817505, + "lr": 2.8636886503067487e-06, + "objective/entropy": 1.61920166015625, + "objective/kl": 10.758232116699219, + "objective/non_score_reward": -1.0758233070373535, + "objective/rlhf_reward": -6.303293228149414, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.81875610351562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.852238118648529, + "step": 711, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968268871307373 + }, + { + "episode": 11408, + "epoch": 0.2050544630981055, + "loss/policy_avg": 0.48776674270629883, + "lr": 2.8634969325153375e-06, + "objective/entropy": -14.844409942626953, + "objective/kl": 10.682808876037598, + "objective/non_score_reward": -1.0682809352874756, + "objective/rlhf_reward": -3.873123502731323, + "objective/scores": 0.1, + "policy/approxkl_avg": 63.03434371948242, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7068368196487427, + "step": 712, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972708225250244 + }, + { + "episode": 11424, + "epoch": 0.20534205701549413, + "loss/policy_avg": 0.8941645622253418, + "lr": 2.8633052147239263e-06, + "objective/entropy": -93.52375030517578, + "objective/kl": 6.7023420333862305, + "objective/non_score_reward": -0.6702341437339783, + "objective/rlhf_reward": -4.680936813354492, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.55957794189453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7821812629699707, + "step": 713, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999279975891113 + }, + { + "episode": 11440, + "epoch": 0.20562965093288277, + "loss/policy_avg": 0.5883785486221313, + "lr": 2.8631134969325155e-06, + "objective/entropy": -66.35493469238281, + "objective/kl": 16.39910125732422, + "objective/non_score_reward": -1.6399102210998535, + "objective/rlhf_reward": -6.159640645980835, + "objective/scores": 0.1, + "policy/approxkl_avg": 99.94831848144531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5211118459701538, + "step": 714, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999578833580017 + }, + { + "episode": 11456, + "epoch": 0.2059172448502714, + "loss/policy_avg": 0.673638105392456, + "lr": 2.8629217791411043e-06, + "objective/entropy": 56.791744232177734, + "objective/kl": 13.02347183227539, + "objective/non_score_reward": -1.302347183227539, + "objective/rlhf_reward": -7.209388732910156, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.524818420410156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6059812307357788, + "step": 715, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978859424591064 + }, + { + "episode": 11472, + "epoch": 0.20620483876766008, + "loss/policy_avg": 0.033942222595214844, + "lr": 2.8627300613496936e-06, + "objective/entropy": -125.00245666503906, + "objective/kl": 16.061996459960938, + "objective/non_score_reward": -1.6061995029449463, + "objective/rlhf_reward": -2.0247983470559117, + "objective/scores": 1.1, + "policy/approxkl_avg": 95.87913513183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49507004022598267, + "step": 716, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000415563583374 + }, + { + "episode": 11488, + "epoch": 0.20649243268504872, + "loss/policy_avg": 0.32607385516166687, + "lr": 2.8625383435582824e-06, + "objective/entropy": 165.55465698242188, + "objective/kl": 17.082618713378906, + "objective/non_score_reward": -1.7082619667053223, + "objective/rlhf_reward": -5.008219312104295, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 79.94935607910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5599914789199829, + "step": 717, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984850883483887 + }, + { + "episode": 11504, + "epoch": 0.20678002660243736, + "loss/policy_avg": 0.18473944067955017, + "lr": 2.862346625766871e-06, + "objective/entropy": 144.7602996826172, + "objective/kl": 13.379175186157227, + "objective/non_score_reward": -1.3379178047180176, + "objective/rlhf_reward": -4.951671248674392, + "objective/scores": 0.1, + "policy/approxkl_avg": 234.73941040039062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8746376037597656, + "step": 718, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992458820343018 + }, + { + "episode": 11520, + "epoch": 0.207067620519826, + "loss/policy_avg": 0.3600703775882721, + "lr": 2.8621549079754604e-06, + "objective/entropy": 63.41236877441406, + "objective/kl": 14.644229888916016, + "objective/non_score_reward": -1.4644229412078857, + "objective/rlhf_reward": -5.457692122459411, + "objective/scores": 0.1, + "policy/approxkl_avg": 130.00485229492188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.47121816873550415, + "step": 719, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991137981414795 + }, + { + "episode": 11536, + "epoch": 0.20735521443721466, + "loss/policy_avg": 0.23683911561965942, + "lr": 2.8619631901840492e-06, + "objective/entropy": 5.525566101074219, + "objective/kl": 11.11286735534668, + "objective/non_score_reward": -1.1112868785858154, + "objective/rlhf_reward": -2.8888880302577764, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 63.8202018737793, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.42511242628097534, + "step": 720, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9963274002075195 + }, + { + "episode": 11552, + "epoch": 0.2076428083546033, + "loss/policy_avg": 0.6022622585296631, + "lr": 2.8617714723926384e-06, + "objective/entropy": -102.19918823242188, + "objective/kl": 15.452737808227539, + "objective/non_score_reward": -1.545273780822754, + "objective/rlhf_reward": -3.257376019598219, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 49.32636642456055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7975940704345703, + "step": 721, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9964704513549805 + }, + { + "episode": 11568, + "epoch": 0.20793040227199194, + "loss/policy_avg": 0.24385926127433777, + "lr": 2.8615797546012273e-06, + "objective/entropy": 32.55232238769531, + "objective/kl": 14.009422302246094, + "objective/non_score_reward": -1.400942325592041, + "objective/rlhf_reward": -3.2037693619728085, + "objective/scores": 0.6, + "policy/approxkl_avg": 153.17926025390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8219155073165894, + "step": 722, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0011699199676514 + }, + { + "episode": 11584, + "epoch": 0.20821799618938058, + "loss/policy_avg": 0.3180992007255554, + "lr": 2.861388036809816e-06, + "objective/entropy": 72.60052490234375, + "objective/kl": 15.588754653930664, + "objective/non_score_reward": -1.558875560760498, + "objective/rlhf_reward": -8.235502243041992, + "objective/scores": -0.5, + "policy/approxkl_avg": 123.99591064453125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7457439303398132, + "step": 723, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982542991638184 + }, + { + "episode": 11600, + "epoch": 0.20850559010676925, + "loss/policy_avg": 0.08571982383728027, + "lr": 2.861196319018405e-06, + "objective/entropy": 43.14987564086914, + "objective/kl": 17.108150482177734, + "objective/non_score_reward": -1.7108149528503418, + "objective/rlhf_reward": -2.4432600051164624, + "objective/scores": 1.1, + "policy/approxkl_avg": 57.58824920654297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.49476999044418335, + "step": 724, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984831809997559 + }, + { + "episode": 11616, + "epoch": 0.2087931840241579, + "loss/policy_avg": 0.21315881609916687, + "lr": 2.8610046012269937e-06, + "objective/entropy": -128.3315887451172, + "objective/kl": 15.486307144165039, + "objective/non_score_reward": -1.548630714416504, + "objective/rlhf_reward": -1.7945227384567257, + "objective/scores": 1.1, + "policy/approxkl_avg": 213.23350524902344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5425740480422974, + "step": 725, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989008903503418 + }, + { + "episode": 11632, + "epoch": 0.20908077794154653, + "loss/policy_avg": 0.5638971328735352, + "lr": 2.860812883435583e-06, + "objective/entropy": 91.92890930175781, + "objective/kl": 10.46470832824707, + "objective/non_score_reward": -1.0464708805084229, + "objective/rlhf_reward": -3.7858832538127896, + "objective/scores": 0.1, + "policy/approxkl_avg": 65.73753356933594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7266730070114136, + "step": 726, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996936321258545 + }, + { + "episode": 11648, + "epoch": 0.20936837185893517, + "loss/policy_avg": 0.33472561836242676, + "lr": 2.8606211656441717e-06, + "objective/entropy": 194.0995330810547, + "objective/kl": 17.15127182006836, + "objective/non_score_reward": -1.7151273488998413, + "objective/rlhf_reward": -6.460509246587753, + "objective/scores": 0.1, + "policy/approxkl_avg": 155.51809692382812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7743821144104004, + "step": 727, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982187747955322 + }, + { + "episode": 11664, + "epoch": 0.20965596577632384, + "loss/policy_avg": 0.0969974622130394, + "lr": 2.8604294478527605e-06, + "objective/entropy": 252.77114868164062, + "objective/kl": 13.091619491577148, + "objective/non_score_reward": -1.3091620206832886, + "objective/rlhf_reward": -2.3129289641391964, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 98.46098327636719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9146468639373779, + "step": 728, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998947262763977 + }, + { + "episode": 11680, + "epoch": 0.20994355969371248, + "loss/policy_avg": 1.0030015707015991, + "lr": 2.8602377300613498e-06, + "objective/entropy": -29.64310073852539, + "objective/kl": 9.100172996520996, + "objective/non_score_reward": -0.9100174307823181, + "objective/rlhf_reward": -5.640069961547852, + "objective/scores": -0.5, + "policy/approxkl_avg": 66.61669921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6575506329536438, + "step": 729, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992115497589111 + }, + { + "episode": 11696, + "epoch": 0.21023115361110112, + "loss/policy_avg": 0.07021422684192657, + "lr": 2.8600460122699386e-06, + "objective/entropy": -20.677194595336914, + "objective/kl": 11.13044548034668, + "objective/non_score_reward": -1.1130445003509521, + "objective/rlhf_reward": -6.452178001403809, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.34206771850586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4357107877731323, + "step": 730, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978418350219727 + }, + { + "episode": 11712, + "epoch": 0.21051874752848979, + "loss/policy_avg": 0.11832322180271149, + "lr": 2.859854294478528e-06, + "objective/entropy": -303.6877136230469, + "objective/kl": 14.444772720336914, + "objective/non_score_reward": -1.4444773197174072, + "objective/rlhf_reward": -5.377909517288208, + "objective/scores": 0.1, + "policy/approxkl_avg": 57.83880615234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7242922782897949, + "step": 731, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0005035400390625 + }, + { + "episode": 11728, + "epoch": 0.21080634144587843, + "loss/policy_avg": 0.2551548480987549, + "lr": 2.8596625766871166e-06, + "objective/entropy": 227.5770721435547, + "objective/kl": 18.140262603759766, + "objective/non_score_reward": -1.8140263557434082, + "objective/rlhf_reward": -4.332386408687803, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 55.3328857421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6282124519348145, + "step": 732, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000762939453125 + }, + { + "episode": 11744, + "epoch": 0.21109393536326707, + "loss/policy_avg": 0.02594660222530365, + "lr": 2.8594708588957054e-06, + "objective/entropy": 29.276161193847656, + "objective/kl": 18.943225860595703, + "objective/non_score_reward": -1.8943226337432861, + "objective/rlhf_reward": -9.577290534973145, + "objective/scores": -0.5, + "policy/approxkl_avg": 128.90655517578125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6192996501922607, + "step": 733, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997767448425293 + }, + { + "episode": 11760, + "epoch": 0.2113815292806557, + "loss/policy_avg": 0.03574896976351738, + "lr": 2.8592791411042947e-06, + "objective/entropy": -71.63544464111328, + "objective/kl": 11.780060768127441, + "objective/non_score_reward": -1.1780060529708862, + "objective/rlhf_reward": -1.788305324257585, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 23.68338394165039, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6172910332679749, + "step": 734, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973077774047852 + }, + { + "episode": 11776, + "epoch": 0.21166912319804437, + "loss/policy_avg": 0.6160891056060791, + "lr": 2.8590874233128835e-06, + "objective/entropy": -90.55964660644531, + "objective/kl": 15.64774227142334, + "objective/non_score_reward": -1.5647742748260498, + "objective/rlhf_reward": -3.8590969800949093, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.503440856933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6182564496994019, + "step": 735, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989289045333862 + }, + { + "episode": 11792, + "epoch": 0.211956717115433, + "loss/policy_avg": 0.3826139569282532, + "lr": 2.8588957055214727e-06, + "objective/entropy": -80.17495727539062, + "objective/kl": 13.256806373596191, + "objective/non_score_reward": -1.3256807327270508, + "objective/rlhf_reward": -2.3790036782037944, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 100.29702758789062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7060404419898987, + "step": 736, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999138593673706 + }, + { + "episode": 11808, + "epoch": 0.21224431103282165, + "loss/policy_avg": -0.032949626445770264, + "lr": 2.8587039877300615e-06, + "objective/entropy": -176.73757934570312, + "objective/kl": 8.910408973693848, + "objective/non_score_reward": -0.8910409212112427, + "objective/rlhf_reward": -5.564163684844971, + "objective/scores": -0.5, + "policy/approxkl_avg": 92.01910400390625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.49018532037734985, + "step": 737, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0024096965789795 + }, + { + "episode": 11824, + "epoch": 0.2125319049502103, + "loss/policy_avg": 0.06443023681640625, + "lr": 2.8585122699386503e-06, + "objective/entropy": -30.911895751953125, + "objective/kl": 12.735221862792969, + "objective/non_score_reward": -1.2735222578048706, + "objective/rlhf_reward": -0.6940891504287716, + "objective/scores": 1.1, + "policy/approxkl_avg": 67.95742797851562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6443949341773987, + "step": 738, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994258880615234 + }, + { + "episode": 11840, + "epoch": 0.21281949886759896, + "loss/policy_avg": 0.8280965089797974, + "lr": 2.8583205521472396e-06, + "objective/entropy": -47.22200012207031, + "objective/kl": 11.071340560913086, + "objective/non_score_reward": -1.1071341037750244, + "objective/rlhf_reward": -6.428536415100098, + "objective/scores": -0.5, + "policy/approxkl_avg": 87.060302734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5341061353683472, + "step": 739, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975091218948364 + }, + { + "episode": 11856, + "epoch": 0.2131070927849876, + "loss/policy_avg": 0.17557145655155182, + "lr": 2.8581288343558284e-06, + "objective/entropy": -33.48394775390625, + "objective/kl": 12.476408004760742, + "objective/non_score_reward": -1.24764084815979, + "objective/rlhf_reward": -2.0668442777704925, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 54.93560791015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5069207549095154, + "step": 740, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968397617340088 + }, + { + "episode": 11872, + "epoch": 0.21339468670237624, + "loss/policy_avg": 0.07476645708084106, + "lr": 2.857937116564417e-06, + "objective/entropy": -147.0701904296875, + "objective/kl": 16.620323181152344, + "objective/non_score_reward": -1.6620323657989502, + "objective/rlhf_reward": -6.248129403591156, + "objective/scores": 0.1, + "policy/approxkl_avg": 201.01736450195312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.700377345085144, + "step": 741, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000122547149658 + }, + { + "episode": 11888, + "epoch": 0.21368228061976488, + "loss/policy_avg": 0.13703355193138123, + "lr": 2.8577453987730064e-06, + "objective/entropy": -27.77845001220703, + "objective/kl": 11.785709381103516, + "objective/non_score_reward": -1.1785709857940674, + "objective/rlhf_reward": -4.314283764362335, + "objective/scores": 0.1, + "policy/approxkl_avg": 119.8775863647461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6977899074554443, + "step": 742, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989674091339111 + }, + { + "episode": 11904, + "epoch": 0.21396987453715355, + "loss/policy_avg": 0.5084174871444702, + "lr": 2.8575536809815952e-06, + "objective/entropy": -178.72799682617188, + "objective/kl": 6.797292709350586, + "objective/non_score_reward": -0.6797292828559875, + "objective/rlhf_reward": -2.3189171761274334, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.976099014282227, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.889729380607605, + "step": 743, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978809356689453 + }, + { + "episode": 11920, + "epoch": 0.2142574684545422, + "loss/policy_avg": 0.08717440068721771, + "lr": 2.8573619631901845e-06, + "objective/entropy": -100.47814178466797, + "objective/kl": 11.974678993225098, + "objective/non_score_reward": -1.1974678039550781, + "objective/rlhf_reward": -4.38987118601799, + "objective/scores": 0.1, + "policy/approxkl_avg": 116.80694580078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6132055521011353, + "step": 744, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981837272644043 + }, + { + "episode": 11936, + "epoch": 0.21454506237193083, + "loss/policy_avg": 0.1440531611442566, + "lr": 2.857170245398773e-06, + "objective/entropy": -37.25544738769531, + "objective/kl": 12.166690826416016, + "objective/non_score_reward": -1.2166690826416016, + "objective/rlhf_reward": -1.9429572268736093, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 31.051591873168945, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7306356430053711, + "step": 745, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000868558883667 + }, + { + "episode": 11952, + "epoch": 0.21483265628931947, + "loss/policy_avg": 0.22620095312595367, + "lr": 2.856978527607362e-06, + "objective/entropy": -61.764095306396484, + "objective/kl": 12.740127563476562, + "objective/non_score_reward": -1.274012804031372, + "objective/rlhf_reward": -0.6960513353347775, + "objective/scores": 1.1, + "policy/approxkl_avg": 113.7810287475586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7009084224700928, + "step": 746, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981598854064941 + }, + { + "episode": 11968, + "epoch": 0.21512025020670814, + "loss/policy_avg": 0.14071348309516907, + "lr": 2.856786809815951e-06, + "objective/entropy": 29.96725845336914, + "objective/kl": 12.842681884765625, + "objective/non_score_reward": -1.2842683792114258, + "objective/rlhf_reward": -2.213354293943617, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.990078926086426, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4330785572528839, + "step": 747, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996414184570312 + }, + { + "episode": 11984, + "epoch": 0.21540784412409678, + "loss/policy_avg": 0.24370655417442322, + "lr": 2.8565950920245397e-06, + "objective/entropy": -147.6442108154297, + "objective/kl": 16.267515182495117, + "objective/non_score_reward": -1.6267515420913696, + "objective/rlhf_reward": -2.107005929946899, + "objective/scores": 1.1, + "policy/approxkl_avg": 191.24424743652344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5275372862815857, + "step": 748, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9969547986984253 + }, + { + "episode": 12000, + "epoch": 0.21569543804148542, + "loss/policy_avg": 0.16116216778755188, + "lr": 2.856403374233129e-06, + "objective/entropy": -21.843975067138672, + "objective/kl": 13.669893264770508, + "objective/non_score_reward": -1.3669893741607666, + "objective/rlhf_reward": -7.467957496643066, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.672183990478516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9156934022903442, + "step": 749, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0002281665802 + }, + { + "episode": 12016, + "epoch": 0.21598303195887408, + "loss/policy_avg": 0.539941668510437, + "lr": 2.8562116564417177e-06, + "objective/entropy": 195.0397186279297, + "objective/kl": 7.716229438781738, + "objective/non_score_reward": -0.7716230154037476, + "objective/rlhf_reward": -5.086491584777832, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.51429748535156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7705237865447998, + "step": 750, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006752014160156 + }, + { + "episode": 12032, + "epoch": 0.21627062587626272, + "loss/policy_avg": 0.711264967918396, + "lr": 2.8560199386503065e-06, + "objective/entropy": -299.9069519042969, + "objective/kl": 12.776535987854004, + "objective/non_score_reward": -1.277653694152832, + "objective/rlhf_reward": -4.710614657402038, + "objective/scores": 0.1, + "policy/approxkl_avg": 82.51051330566406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6128495335578918, + "step": 751, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 1.9975297451019287 + }, + { + "episode": 12048, + "epoch": 0.21655821979365136, + "loss/policy_avg": 0.4313165545463562, + "lr": 2.8558282208588958e-06, + "objective/entropy": 8.31052017211914, + "objective/kl": 17.218887329101562, + "objective/non_score_reward": -1.7218886613845825, + "objective/rlhf_reward": -3.9638356163513393, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 92.13627624511719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56125807762146, + "step": 752, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996793270111084 + }, + { + "episode": 12064, + "epoch": 0.21684581371104, + "loss/policy_avg": 0.2670312821865082, + "lr": 2.8556365030674846e-06, + "objective/entropy": 1.7406082153320312, + "objective/kl": 10.113969802856445, + "objective/non_score_reward": -1.011396884918213, + "objective/rlhf_reward": -2.0981764336692645, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 66.017822265625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6029417514801025, + "step": 753, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002827644348145 + }, + { + "episode": 12080, + "epoch": 0.21713340762842867, + "loss/policy_avg": 0.22361746430397034, + "lr": 2.855444785276074e-06, + "objective/entropy": -6.838325500488281, + "objective/kl": 9.408108711242676, + "objective/non_score_reward": -0.9408108592033386, + "objective/rlhf_reward": -3.363243496417999, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.14617156982422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7355029582977295, + "step": 754, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000067949295044 + }, + { + "episode": 12096, + "epoch": 0.2174210015458173, + "loss/policy_avg": 0.4131731688976288, + "lr": 2.8552530674846626e-06, + "objective/entropy": -199.2462158203125, + "objective/kl": 11.438251495361328, + "objective/non_score_reward": -1.1438250541687012, + "objective/rlhf_reward": -6.575300216674805, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.276948928833008, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49213117361068726, + "step": 755, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0014610290527344 + }, + { + "episode": 12112, + "epoch": 0.21770859546320595, + "loss/policy_avg": 0.2563888430595398, + "lr": 2.8550613496932514e-06, + "objective/entropy": -50.35034942626953, + "objective/kl": 10.818819046020508, + "objective/non_score_reward": -1.0818817615509033, + "objective/rlhf_reward": -2.7234071231523327, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 5.181286811828613, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6795003414154053, + "step": 756, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993948936462402 + }, + { + "episode": 12128, + "epoch": 0.2179961893805946, + "loss/policy_avg": 0.27015259861946106, + "lr": 2.8548696319018407e-06, + "objective/entropy": 50.269439697265625, + "objective/kl": 11.087736129760742, + "objective/non_score_reward": -1.1087735891342163, + "objective/rlhf_reward": -0.035094296932220104, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.901422500610352, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6422553062438965, + "step": 757, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998644471168518 + }, + { + "episode": 12144, + "epoch": 0.21828378329798326, + "loss/policy_avg": 0.6295909881591797, + "lr": 2.8546779141104295e-06, + "objective/entropy": 40.25965118408203, + "objective/kl": 14.175272941589355, + "objective/non_score_reward": -1.417527198791504, + "objective/rlhf_reward": -2.7463900640022487, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 221.42971801757812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5903568267822266, + "step": 758, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998144268989563 + }, + { + "episode": 12160, + "epoch": 0.2185713772153719, + "loss/policy_avg": 0.6265522241592407, + "lr": 2.8544861963190187e-06, + "objective/entropy": -87.25729370117188, + "objective/kl": 10.555778503417969, + "objective/non_score_reward": -1.0555777549743652, + "objective/rlhf_reward": 0.17768906950950658, + "objective/scores": 1.1, + "policy/approxkl_avg": 90.87506866455078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6326598525047302, + "step": 759, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0008277893066406 + }, + { + "episode": 12176, + "epoch": 0.21885897113276054, + "loss/policy_avg": 0.12339673936367035, + "lr": 2.8542944785276075e-06, + "objective/entropy": -33.543636322021484, + "objective/kl": 7.962390899658203, + "objective/non_score_reward": -0.7962390184402466, + "objective/rlhf_reward": -2.784956073760986, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.08357810974121, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8025330901145935, + "step": 760, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9965462684631348 + }, + { + "episode": 12192, + "epoch": 0.21914656505014918, + "loss/policy_avg": 0.1615457832813263, + "lr": 2.8541027607361963e-06, + "objective/entropy": 145.47601318359375, + "objective/kl": 15.407791137695312, + "objective/non_score_reward": -1.5407792329788208, + "objective/rlhf_reward": -8.163117408752441, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.53536224365234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5605360269546509, + "step": 761, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999860525131226 + }, + { + "episode": 12208, + "epoch": 0.21943415896753785, + "loss/policy_avg": 0.30102694034576416, + "lr": 2.8539110429447856e-06, + "objective/entropy": 27.323680877685547, + "objective/kl": 3.660177707672119, + "objective/non_score_reward": -0.36601775884628296, + "objective/rlhf_reward": 2.935928934812546, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.197434902191162, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.36402425169944763, + "step": 762, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990768432617188 + }, + { + "episode": 12224, + "epoch": 0.21972175288492649, + "loss/policy_avg": 0.18315057456493378, + "lr": 2.8537193251533744e-06, + "objective/entropy": 72.46862030029297, + "objective/kl": 10.31401252746582, + "objective/non_score_reward": -1.0314011573791504, + "objective/rlhf_reward": -1.7256047189235686, + "objective/scores": 0.6, + "policy/approxkl_avg": 9.106523513793945, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7303919792175293, + "step": 763, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000040531158447 + }, + { + "episode": 12240, + "epoch": 0.22000934680231513, + "loss/policy_avg": 0.5514622330665588, + "lr": 2.853527607361963e-06, + "objective/entropy": -74.59797668457031, + "objective/kl": 14.129312515258789, + "objective/non_score_reward": -1.4129313230514526, + "objective/rlhf_reward": -1.2517252624034878, + "objective/scores": 1.1, + "policy/approxkl_avg": 46.52693176269531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4529426395893097, + "step": 764, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981811046600342 + }, + { + "episode": 12256, + "epoch": 0.22029694071970377, + "loss/policy_avg": -0.04296427220106125, + "lr": 2.8533358895705524e-06, + "objective/entropy": -14.170623779296875, + "objective/kl": 16.591388702392578, + "objective/non_score_reward": -1.6591390371322632, + "objective/rlhf_reward": -8.636556625366211, + "objective/scores": -0.5, + "policy/approxkl_avg": 108.84127807617188, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7804018259048462, + "step": 765, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000200271606445 + }, + { + "episode": 12272, + "epoch": 0.22058453463709243, + "loss/policy_avg": 0.14860758185386658, + "lr": 2.8531441717791412e-06, + "objective/entropy": 234.34619140625, + "objective/kl": 14.954992294311523, + "objective/non_score_reward": -1.4954993724822998, + "objective/rlhf_reward": -3.8592914364495616, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 31.390544891357422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7560025453567505, + "step": 766, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960116147994995 + }, + { + "episode": 12288, + "epoch": 0.22087212855448107, + "loss/policy_avg": 0.027001656591892242, + "lr": 2.85295245398773e-06, + "objective/entropy": -231.42864990234375, + "objective/kl": 14.624351501464844, + "objective/non_score_reward": -1.462435245513916, + "objective/rlhf_reward": -1.449740996956825, + "objective/scores": 1.1, + "policy/approxkl_avg": 125.69221496582031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6178066730499268, + "step": 767, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983733892440796 + }, + { + "episode": 12304, + "epoch": 0.2211597224718697, + "loss/policy_avg": 0.10003480315208435, + "lr": 2.852760736196319e-06, + "objective/entropy": 105.91529846191406, + "objective/kl": 11.028611183166504, + "objective/non_score_reward": -1.1028611660003662, + "objective/rlhf_reward": -0.011445081233977916, + "objective/scores": 1.1, + "policy/approxkl_avg": 99.16251373291016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4474494457244873, + "step": 768, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001845359802246 + }, + { + "episode": 12320, + "epoch": 0.22144731638925838, + "loss/policy_avg": 0.3084249794483185, + "lr": 2.852569018404908e-06, + "objective/entropy": 311.5811767578125, + "objective/kl": 12.777912139892578, + "objective/non_score_reward": -1.2777912616729736, + "objective/rlhf_reward": -4.711165154725313, + "objective/scores": 0.1, + "policy/approxkl_avg": 87.19984436035156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.747369110584259, + "step": 769, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000682830810547 + }, + { + "episode": 12336, + "epoch": 0.22173491030664702, + "loss/policy_avg": 0.1643168181180954, + "lr": 2.852377300613497e-06, + "objective/entropy": -229.23110961914062, + "objective/kl": 13.623214721679688, + "objective/non_score_reward": -1.362321376800537, + "objective/rlhf_reward": -7.449285507202148, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.4518051147461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7264796495437622, + "step": 770, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0006937980651855 + }, + { + "episode": 12352, + "epoch": 0.22202250422403566, + "loss/policy_avg": -0.010787129402160645, + "lr": 2.8521855828220857e-06, + "objective/entropy": 124.02545166015625, + "objective/kl": 7.704123497009277, + "objective/non_score_reward": -0.7704123258590698, + "objective/rlhf_reward": -5.081649303436279, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.008346557617188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6329702138900757, + "step": 771, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013375282287598 + }, + { + "episode": 12368, + "epoch": 0.2223100981414243, + "loss/policy_avg": 0.2653355300426483, + "lr": 2.851993865030675e-06, + "objective/entropy": 23.56524658203125, + "objective/kl": 15.354362487792969, + "objective/non_score_reward": -1.5354361534118652, + "objective/rlhf_reward": -5.741744464635849, + "objective/scores": 0.1, + "policy/approxkl_avg": 111.05146789550781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6844107508659363, + "step": 772, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988980293273926 + }, + { + "episode": 12384, + "epoch": 0.22259769205881297, + "loss/policy_avg": 0.08450818061828613, + "lr": 2.8518021472392637e-06, + "objective/entropy": 342.4418029785156, + "objective/kl": 15.424835205078125, + "objective/non_score_reward": -1.5424836874008179, + "objective/rlhf_reward": -5.769934868812561, + "objective/scores": 0.1, + "policy/approxkl_avg": 132.2557373046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8565220236778259, + "step": 773, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9969991445541382 + }, + { + "episode": 12400, + "epoch": 0.2228852859762016, + "loss/policy_avg": -0.5032927393913269, + "lr": 2.851610429447853e-06, + "objective/entropy": -107.81707763671875, + "objective/kl": 13.79594612121582, + "objective/non_score_reward": -1.3795948028564453, + "objective/rlhf_reward": -3.118379211425781, + "objective/scores": 0.6, + "policy/approxkl_avg": 41.49142837524414, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.35487908124923706, + "step": 774, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.006086826324463 + }, + { + "episode": 12416, + "epoch": 0.22317287989359025, + "loss/policy_avg": 0.14595381915569305, + "lr": 2.8514187116564418e-06, + "objective/entropy": -56.38682556152344, + "objective/kl": 13.150504112243652, + "objective/non_score_reward": -1.3150502443313599, + "objective/rlhf_reward": -3.3127899570035293, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 46.86200714111328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7194310426712036, + "step": 775, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001683235168457 + }, + { + "episode": 12432, + "epoch": 0.2234604738109789, + "loss/policy_avg": -0.2638096213340759, + "lr": 2.8512269938650306e-06, + "objective/entropy": 76.923095703125, + "objective/kl": 12.93875503540039, + "objective/non_score_reward": -1.2938756942749023, + "objective/rlhf_reward": -7.175502777099609, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.019851684570312, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5464330911636353, + "step": 776, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.014585018157959 + }, + { + "episode": 12448, + "epoch": 0.22374806772836756, + "loss/policy_avg": 0.19038286805152893, + "lr": 2.85103527607362e-06, + "objective/entropy": 99.66128540039062, + "objective/kl": 11.428020477294922, + "objective/non_score_reward": -1.142802119255066, + "objective/rlhf_reward": -4.1712084174156185, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.72539520263672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.533401608467102, + "step": 777, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9978110790252686 + }, + { + "episode": 12464, + "epoch": 0.2240356616457562, + "loss/policy_avg": 0.31564778089523315, + "lr": 2.8508435582822086e-06, + "objective/entropy": 43.836891174316406, + "objective/kl": 15.314764022827148, + "objective/non_score_reward": -1.5314764976501465, + "objective/rlhf_reward": -1.7259061098098751, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.3494644165039, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7181559205055237, + "step": 778, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9969323873519897 + }, + { + "episode": 12480, + "epoch": 0.22432325556314484, + "loss/policy_avg": 0.15969273447990417, + "lr": 2.8506518404907974e-06, + "objective/entropy": 62.45077896118164, + "objective/kl": 15.108039855957031, + "objective/non_score_reward": -1.5108040571212769, + "objective/rlhf_reward": -5.643216168880462, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.81277465820312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6362130641937256, + "step": 779, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979162216186523 + }, + { + "episode": 12496, + "epoch": 0.22461084948053348, + "loss/policy_avg": -0.1570049226284027, + "lr": 2.8504601226993867e-06, + "objective/entropy": 130.28684997558594, + "objective/kl": 15.398730278015137, + "objective/non_score_reward": -1.5398731231689453, + "objective/rlhf_reward": -3.759492194652557, + "objective/scores": 0.6, + "policy/approxkl_avg": 36.37596893310547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5687062740325928, + "step": 780, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0024261474609375 + }, + { + "episode": 12512, + "epoch": 0.22489844339792214, + "loss/policy_avg": 0.1517828404903412, + "lr": 2.8502684049079755e-06, + "objective/entropy": 66.80693817138672, + "objective/kl": 7.023012161254883, + "objective/non_score_reward": -0.7023012042045593, + "objective/rlhf_reward": -1.1473453245764835, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 70.58222961425781, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5671324729919434, + "step": 781, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999403953552246 + }, + { + "episode": 12528, + "epoch": 0.22518603731531078, + "loss/policy_avg": 0.3743288516998291, + "lr": 2.8500766871165647e-06, + "objective/entropy": 37.623748779296875, + "objective/kl": 9.943754196166992, + "objective/non_score_reward": -0.994375467300415, + "objective/rlhf_reward": -1.5775016754865643, + "objective/scores": 0.6, + "policy/approxkl_avg": 14.220436096191406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5091394186019897, + "step": 782, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982104301452637 + }, + { + "episode": 12544, + "epoch": 0.22547363123269942, + "loss/policy_avg": 0.4069502055644989, + "lr": 2.8498849693251535e-06, + "objective/entropy": 184.85443115234375, + "objective/kl": 14.495233535766602, + "objective/non_score_reward": -1.4495233297348022, + "objective/rlhf_reward": -7.798093795776367, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.534117221832275, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6540646553039551, + "step": 783, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999969720840454 + }, + { + "episode": 12560, + "epoch": 0.22576122515008806, + "loss/policy_avg": -0.15900185704231262, + "lr": 2.8496932515337423e-06, + "objective/entropy": -7.934391021728516, + "objective/kl": 14.886871337890625, + "objective/non_score_reward": -1.4886871576309204, + "objective/rlhf_reward": -7.954748630523682, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.033672332763672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6111799478530884, + "step": 784, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002901554107666 + }, + { + "episode": 12576, + "epoch": 0.22604881906747673, + "loss/policy_avg": 0.05465098097920418, + "lr": 2.8495015337423316e-06, + "objective/entropy": 184.02117919921875, + "objective/kl": 11.928532600402832, + "objective/non_score_reward": -1.192853331565857, + "objective/rlhf_reward": -4.371413117647171, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.86432647705078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8463940620422363, + "step": 785, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000133752822876 + }, + { + "episode": 12592, + "epoch": 0.22633641298486537, + "loss/policy_avg": 0.3418072462081909, + "lr": 2.8493098159509204e-06, + "objective/entropy": -23.809371948242188, + "objective/kl": 15.163887023925781, + "objective/non_score_reward": -1.5163884162902832, + "objective/rlhf_reward": -8.065553665161133, + "objective/scores": -0.5, + "policy/approxkl_avg": 78.87910461425781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8011717796325684, + "step": 786, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.99837064743042 + }, + { + "episode": 12608, + "epoch": 0.226624006902254, + "loss/policy_avg": -0.3748244047164917, + "lr": 2.8491180981595096e-06, + "objective/entropy": 126.95550537109375, + "objective/kl": 14.842533111572266, + "objective/non_score_reward": -1.4842532873153687, + "objective/rlhf_reward": -5.537013149261474, + "objective/scores": 0.1, + "policy/approxkl_avg": 80.93215942382812, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6581273674964905, + "step": 787, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006587505340576 + }, + { + "episode": 12624, + "epoch": 0.22691160081964268, + "loss/policy_avg": 0.230320543050766, + "lr": 2.8489263803680984e-06, + "objective/entropy": 95.23011779785156, + "objective/kl": 14.606027603149414, + "objective/non_score_reward": -1.4606029987335205, + "objective/rlhf_reward": -4.238291654650288, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 115.39529418945312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.728987455368042, + "step": 788, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998013973236084 + }, + { + "episode": 12640, + "epoch": 0.22719919473703132, + "loss/policy_avg": -0.3885463774204254, + "lr": 2.8487346625766872e-06, + "objective/entropy": 97.80613708496094, + "objective/kl": 11.48002815246582, + "objective/non_score_reward": -1.1480028629302979, + "objective/rlhf_reward": -6.592011451721191, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.53124237060547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.594038724899292, + "step": 789, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.010626792907715 + }, + { + "episode": 12656, + "epoch": 0.22748678865441996, + "loss/policy_avg": 0.3055153489112854, + "lr": 2.848542944785276e-06, + "objective/entropy": 199.88526916503906, + "objective/kl": 13.171801567077637, + "objective/non_score_reward": -1.3171800374984741, + "objective/rlhf_reward": -3.6068607918625935, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 85.62678527832031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5295522212982178, + "step": 790, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962729215621948 + }, + { + "episode": 12672, + "epoch": 0.2277743825718086, + "loss/policy_avg": 0.18343961238861084, + "lr": 2.848351226993865e-06, + "objective/entropy": 212.48171997070312, + "objective/kl": 14.552225112915039, + "objective/non_score_reward": -1.4552226066589355, + "objective/rlhf_reward": -7.820890426635742, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.826194763183594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7555446028709412, + "step": 791, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981155395507812 + }, + { + "episode": 12688, + "epoch": 0.22806197648919727, + "loss/policy_avg": 1.651806116104126, + "lr": 2.848159509202454e-06, + "objective/entropy": 22.058094024658203, + "objective/kl": 12.990779876708984, + "objective/non_score_reward": -1.2990779876708984, + "objective/rlhf_reward": -4.796312069892883, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.44281005859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7748013734817505, + "step": 792, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999983549118042 + }, + { + "episode": 12704, + "epoch": 0.2283495704065859, + "loss/policy_avg": 0.020992066711187363, + "lr": 2.847967791411043e-06, + "objective/entropy": 100.20834350585938, + "objective/kl": 20.087814331054688, + "objective/non_score_reward": -2.0087814331054688, + "objective/rlhf_reward": -6.373266016662704, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 137.36184692382812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6344403028488159, + "step": 793, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986518621444702 + }, + { + "episode": 12720, + "epoch": 0.22863716432397455, + "loss/policy_avg": 0.06677938997745514, + "lr": 2.8477760736196317e-06, + "objective/entropy": 7.002399444580078, + "objective/kl": 10.064220428466797, + "objective/non_score_reward": -1.0064222812652588, + "objective/rlhf_reward": -3.62568869292736, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.89893341064453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5471144318580627, + "step": 794, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984047412872314 + }, + { + "episode": 12736, + "epoch": 0.22892475824136319, + "loss/policy_avg": -0.07828734815120697, + "lr": 2.847584355828221e-06, + "objective/entropy": -104.39112854003906, + "objective/kl": 18.24449920654297, + "objective/non_score_reward": -1.824450135231018, + "objective/rlhf_reward": -9.297800064086914, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.5714225769043, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6325054168701172, + "step": 795, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970802068710327 + }, + { + "episode": 12752, + "epoch": 0.22921235215875185, + "loss/policy_avg": 0.28975850343704224, + "lr": 2.8473926380368097e-06, + "objective/entropy": 133.83016967773438, + "objective/kl": 8.593679428100586, + "objective/non_score_reward": -0.8593680262565613, + "objective/rlhf_reward": -5.437472343444824, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.80305004119873, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.729377269744873, + "step": 796, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978275299072266 + }, + { + "episode": 12768, + "epoch": 0.2294999460761405, + "loss/policy_avg": 1.422573208808899, + "lr": 2.847200920245399e-06, + "objective/entropy": 149.96119689941406, + "objective/kl": 16.894641876220703, + "objective/non_score_reward": -1.6894640922546387, + "objective/rlhf_reward": -6.357856726646423, + "objective/scores": 0.1, + "policy/approxkl_avg": 230.54080200195312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8895937204360962, + "step": 797, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0016024112701416 + }, + { + "episode": 12784, + "epoch": 0.22978753999352913, + "loss/policy_avg": 0.19597771763801575, + "lr": 2.8470092024539878e-06, + "objective/entropy": 95.1851577758789, + "objective/kl": 12.648846626281738, + "objective/non_score_reward": -1.2648844718933105, + "objective/rlhf_reward": -7.059537887573242, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.24577522277832, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7796587944030762, + "step": 798, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000627279281616 + }, + { + "episode": 12800, + "epoch": 0.23007513391091777, + "loss/policy_avg": 0.13448825478553772, + "lr": 2.8468174846625766e-06, + "objective/entropy": 102.28286743164062, + "objective/kl": 13.572122573852539, + "objective/non_score_reward": -1.3572125434875488, + "objective/rlhf_reward": -1.0288498461246487, + "objective/scores": 1.1, + "policy/approxkl_avg": 100.61666870117188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6195476055145264, + "step": 799, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980723857879639 + }, + { + "episode": 12816, + "epoch": 0.23036272782830644, + "loss/policy_avg": 0.6187319159507751, + "lr": 2.846625766871166e-06, + "objective/entropy": -39.129539489746094, + "objective/kl": 18.39947509765625, + "objective/non_score_reward": -1.8399477005004883, + "objective/rlhf_reward": -2.9597911596298214, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.12504577636719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5955438613891602, + "step": 800, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001791000366211 + }, + { + "episode": 12832, + "epoch": 0.23065032174569508, + "loss/policy_avg": 0.3273153305053711, + "lr": 2.8464340490797546e-06, + "objective/entropy": 13.417747497558594, + "objective/kl": 7.873808860778809, + "objective/non_score_reward": -0.7873809337615967, + "objective/rlhf_reward": -1.324694927009653, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 17.407360076904297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5497540235519409, + "step": 801, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995551109313965 + }, + { + "episode": 12848, + "epoch": 0.23093791566308372, + "loss/policy_avg": 0.3061869144439697, + "lr": 2.8462423312883434e-06, + "objective/entropy": -106.52912902832031, + "objective/kl": 10.210708618164062, + "objective/non_score_reward": -1.0210708379745483, + "objective/rlhf_reward": -6.084282875061035, + "objective/scores": -0.5, + "policy/approxkl_avg": 96.2752456665039, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4860496520996094, + "step": 802, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976447820663452 + }, + { + "episode": 12864, + "epoch": 0.23122550958047236, + "loss/policy_avg": 0.1686146855354309, + "lr": 2.8460506134969327e-06, + "objective/entropy": 29.669536590576172, + "objective/kl": 18.03110122680664, + "objective/non_score_reward": -1.803110122680664, + "objective/rlhf_reward": -4.288721655250761, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 38.228782653808594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8174216151237488, + "step": 803, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982223510742188 + }, + { + "episode": 12880, + "epoch": 0.23151310349786103, + "loss/policy_avg": 0.25656819343566895, + "lr": 2.8458588957055215e-06, + "objective/entropy": -39.153350830078125, + "objective/kl": 9.698554039001465, + "objective/non_score_reward": -0.9698554277420044, + "objective/rlhf_reward": 0.5205781698226932, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.279545783996582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4837508201599121, + "step": 804, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997928142547607 + }, + { + "episode": 12896, + "epoch": 0.23180069741524967, + "loss/policy_avg": 0.08962078392505646, + "lr": 2.8456671779141107e-06, + "objective/entropy": 91.72572326660156, + "objective/kl": 9.199845314025879, + "objective/non_score_reward": -0.9199845790863037, + "objective/rlhf_reward": -1.9466050426165262, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 36.79070281982422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7538923025131226, + "step": 805, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967260360717773 + }, + { + "episode": 12912, + "epoch": 0.2320882913326383, + "loss/policy_avg": 0.2165328860282898, + "lr": 2.8454754601226995e-06, + "objective/entropy": 141.5177001953125, + "objective/kl": 16.128833770751953, + "objective/non_score_reward": -1.61288321018219, + "objective/rlhf_reward": -4.328826548830543, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 8.008130073547363, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.76326584815979, + "step": 806, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970195293426514 + }, + { + "episode": 12928, + "epoch": 0.23237588525002698, + "loss/policy_avg": 0.33014577627182007, + "lr": 2.8452837423312883e-06, + "objective/entropy": -7.563770294189453, + "objective/kl": 15.92538070678711, + "objective/non_score_reward": -1.5925382375717163, + "objective/rlhf_reward": -8.370153427124023, + "objective/scores": -0.5, + "policy/approxkl_avg": 122.01593017578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6476025581359863, + "step": 807, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975717067718506 + }, + { + "episode": 12944, + "epoch": 0.23266347916741562, + "loss/policy_avg": 0.2228621542453766, + "lr": 2.8450920245398776e-06, + "objective/entropy": -146.32801818847656, + "objective/kl": 14.984716415405273, + "objective/non_score_reward": -1.498471736907959, + "objective/rlhf_reward": -7.993886947631836, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.571311950683594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6879932880401611, + "step": 808, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982060194015503 + }, + { + "episode": 12960, + "epoch": 0.23295107308480426, + "loss/policy_avg": -0.1637599915266037, + "lr": 2.8449003067484664e-06, + "objective/entropy": -171.0908203125, + "objective/kl": 4.425614833831787, + "objective/non_score_reward": -0.44256141781806946, + "objective/rlhf_reward": -1.370245734602213, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.86245059967041, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.36712339520454407, + "step": 809, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001098871231079 + }, + { + "episode": 12976, + "epoch": 0.2332386670021929, + "loss/policy_avg": 0.16715390980243683, + "lr": 2.8447085889570556e-06, + "objective/entropy": -126.9478530883789, + "objective/kl": 13.796289443969727, + "objective/non_score_reward": -1.379629135131836, + "objective/rlhf_reward": -5.118516108393669, + "objective/scores": 0.1, + "policy/approxkl_avg": 139.91973876953125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6227424144744873, + "step": 810, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997196197509766 + }, + { + "episode": 12992, + "epoch": 0.23352626091958156, + "loss/policy_avg": 0.8460204601287842, + "lr": 2.8445168711656444e-06, + "objective/entropy": -108.99869537353516, + "objective/kl": 14.032926559448242, + "objective/non_score_reward": -1.4032926559448242, + "objective/rlhf_reward": -3.2131706982851025, + "objective/scores": 0.6, + "policy/approxkl_avg": 148.4869384765625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5060499906539917, + "step": 811, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975128173828125 + }, + { + "episode": 13008, + "epoch": 0.2338138548369702, + "loss/policy_avg": 0.5107466578483582, + "lr": 2.8443251533742332e-06, + "objective/entropy": 26.51531219482422, + "objective/kl": 12.1517333984375, + "objective/non_score_reward": -1.215173363685608, + "objective/rlhf_reward": -2.4606932833790776, + "objective/scores": 0.6, + "policy/approxkl_avg": 44.8768310546875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6984570622444153, + "step": 812, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982036352157593 + }, + { + "episode": 13024, + "epoch": 0.23410144875435884, + "loss/policy_avg": 0.5176931619644165, + "lr": 2.844133435582822e-06, + "objective/entropy": 87.28473663330078, + "objective/kl": 11.600730895996094, + "objective/non_score_reward": -1.160073161125183, + "objective/rlhf_reward": -4.240292406082153, + "objective/scores": 0.1, + "policy/approxkl_avg": 86.09700012207031, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5015227794647217, + "step": 813, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994313716888428 + }, + { + "episode": 13040, + "epoch": 0.23438904267174748, + "loss/policy_avg": 0.2525648772716522, + "lr": 2.843941717791411e-06, + "objective/entropy": -90.09027099609375, + "objective/kl": 17.53409767150879, + "objective/non_score_reward": -1.7534098625183105, + "objective/rlhf_reward": -5.188810522827218, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 133.2302703857422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6975289583206177, + "step": 814, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996279239654541 + }, + { + "episode": 13056, + "epoch": 0.23467663658913615, + "loss/policy_avg": 0.0005577714182436466, + "lr": 2.84375e-06, + "objective/entropy": 87.36531829833984, + "objective/kl": 8.974178314208984, + "objective/non_score_reward": -0.8974178433418274, + "objective/rlhf_reward": -3.189671283960342, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.3939369916915894, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5556995868682861, + "step": 815, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007152557373047 + }, + { + "episode": 13072, + "epoch": 0.2349642305065248, + "loss/policy_avg": -0.10415857285261154, + "lr": 2.843558282208589e-06, + "objective/entropy": 281.04083251953125, + "objective/kl": 10.391765594482422, + "objective/non_score_reward": -1.0391765832901, + "objective/rlhf_reward": -3.7567061990499493, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.441307067871094, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.8066496849060059, + "step": 816, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001943588256836 + }, + { + "episode": 13088, + "epoch": 0.23525182442391343, + "loss/policy_avg": 0.2472541332244873, + "lr": 2.8433665644171777e-06, + "objective/entropy": 124.40581512451172, + "objective/kl": 11.622451782226562, + "objective/non_score_reward": -1.1622451543807983, + "objective/rlhf_reward": -2.248980677127838, + "objective/scores": 0.6, + "policy/approxkl_avg": 6.343780517578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5752452611923218, + "step": 817, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999915599822998 + }, + { + "episode": 13104, + "epoch": 0.23553941834130207, + "loss/policy_avg": 0.2929553687572479, + "lr": 2.843174846625767e-06, + "objective/entropy": -18.095306396484375, + "objective/kl": 10.766685485839844, + "objective/non_score_reward": -1.076668620109558, + "objective/rlhf_reward": -6.306674480438232, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.248212814331055, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44631391763687134, + "step": 818, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986329078674316 + }, + { + "episode": 13120, + "epoch": 0.23582701225869074, + "loss/policy_avg": 0.2811235189437866, + "lr": 2.8429831288343558e-06, + "objective/entropy": -93.74215698242188, + "objective/kl": 10.5712890625, + "objective/non_score_reward": -1.05712890625, + "objective/rlhf_reward": -3.8285156697034832, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.029077529907227, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6386555433273315, + "step": 819, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0000128746032715 + }, + { + "episode": 13136, + "epoch": 0.23611460617607938, + "loss/policy_avg": 0.321929007768631, + "lr": 2.842791411042945e-06, + "objective/entropy": -222.2208251953125, + "objective/kl": 10.463525772094727, + "objective/non_score_reward": -1.0463526248931885, + "objective/rlhf_reward": -3.7854103505611416, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.59898376464844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7040956616401672, + "step": 820, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0001721382141113 + }, + { + "episode": 13152, + "epoch": 0.23640220009346802, + "loss/policy_avg": 0.2252596765756607, + "lr": 2.842599693251534e-06, + "objective/entropy": -5.484672546386719, + "objective/kl": 7.207294940948486, + "objective/non_score_reward": -0.7207294702529907, + "objective/rlhf_reward": 0.04080113327386714, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 4.542896270751953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5736124515533447, + "step": 821, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999035120010376 + }, + { + "episode": 13168, + "epoch": 0.23668979401085666, + "loss/policy_avg": 0.17349383234977722, + "lr": 2.8424079754601226e-06, + "objective/entropy": 260.90020751953125, + "objective/kl": 13.826977729797363, + "objective/non_score_reward": -1.3826978206634521, + "objective/rlhf_reward": -7.530791282653809, + "objective/scores": -0.5, + "policy/approxkl_avg": 39.94245529174805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8797262907028198, + "step": 822, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994349479675293 + }, + { + "episode": 13184, + "epoch": 0.23697738792824533, + "loss/policy_avg": -0.0034087272360920906, + "lr": 2.842216257668712e-06, + "objective/entropy": -51.63107681274414, + "objective/kl": 10.85478401184082, + "objective/non_score_reward": -1.0854783058166504, + "objective/rlhf_reward": 0.058087015151977894, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.927780151367188, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6021898984909058, + "step": 823, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001553535461426 + }, + { + "episode": 13200, + "epoch": 0.23726498184563397, + "loss/policy_avg": 0.42283856868743896, + "lr": 2.8420245398773006e-06, + "objective/entropy": 200.68020629882812, + "objective/kl": 10.381429672241211, + "objective/non_score_reward": -1.0381429195404053, + "objective/rlhf_reward": -6.152571678161621, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.65203857421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7309496998786926, + "step": 824, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966254234313965 + }, + { + "episode": 13216, + "epoch": 0.2375525757630226, + "loss/policy_avg": -0.09755183756351471, + "lr": 2.84183282208589e-06, + "objective/entropy": -69.01538848876953, + "objective/kl": 5.6848883628845215, + "objective/non_score_reward": -0.5684888362884521, + "objective/rlhf_reward": 2.1260446399450306, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.0148842334747314, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44518405199050903, + "step": 825, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0009875297546387 + }, + { + "episode": 13232, + "epoch": 0.23784016968041127, + "loss/policy_avg": -0.25965848565101624, + "lr": 2.8416411042944787e-06, + "objective/entropy": 171.923828125, + "objective/kl": 8.58846664428711, + "objective/non_score_reward": -0.8588467836380005, + "objective/rlhf_reward": -3.035387037694454, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.28413391113281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5065795183181763, + "step": 826, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.007476806640625 + }, + { + "episode": 13248, + "epoch": 0.2381277635977999, + "loss/policy_avg": 0.05290607735514641, + "lr": 2.8414493865030675e-06, + "objective/entropy": 31.982357025146484, + "objective/kl": 14.847391128540039, + "objective/non_score_reward": -1.484739065170288, + "objective/rlhf_reward": -5.538956558704376, + "objective/scores": 0.1, + "policy/approxkl_avg": 134.27171325683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3722934126853943, + "step": 827, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999661922454834 + }, + { + "episode": 13264, + "epoch": 0.23841535751518855, + "loss/policy_avg": 0.2654344141483307, + "lr": 2.8412576687116567e-06, + "objective/entropy": 174.80413818359375, + "objective/kl": 10.663211822509766, + "objective/non_score_reward": -1.0663211345672607, + "objective/rlhf_reward": -3.8652845233678814, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.65331268310547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6540871262550354, + "step": 828, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004587173461914 + }, + { + "episode": 13280, + "epoch": 0.2387029514325772, + "loss/policy_avg": 0.29667001962661743, + "lr": 2.8410659509202455e-06, + "objective/entropy": -67.4645004272461, + "objective/kl": 11.012588500976562, + "objective/non_score_reward": -1.1012588739395142, + "objective/rlhf_reward": -4.005035495758056, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.841941833496094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6608229279518127, + "step": 829, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9974405765533447 + }, + { + "episode": 13296, + "epoch": 0.23899054534996586, + "loss/policy_avg": 0.0870223194360733, + "lr": 2.8408742331288343e-06, + "objective/entropy": 218.6323699951172, + "objective/kl": 13.182598114013672, + "objective/non_score_reward": -1.3182597160339355, + "objective/rlhf_reward": -3.6111795954114063, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 191.98348999023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8216352462768555, + "step": 830, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983137845993042 + }, + { + "episode": 13312, + "epoch": 0.2392781392673545, + "loss/policy_avg": 0.10631455481052399, + "lr": 2.8406825153374236e-06, + "objective/entropy": -176.3839569091797, + "objective/kl": 10.902888298034668, + "objective/non_score_reward": -1.0902888774871826, + "objective/rlhf_reward": -6.3611555099487305, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.94891929626465, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6645264029502869, + "step": 831, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989513158798218 + }, + { + "episode": 13328, + "epoch": 0.23956573318474314, + "loss/policy_avg": 0.5899176001548767, + "lr": 2.8404907975460124e-06, + "objective/entropy": 200.20742797851562, + "objective/kl": 18.23828125, + "objective/non_score_reward": -1.8238282203674316, + "objective/rlhf_reward": -9.295312881469727, + "objective/scores": -0.5, + "policy/approxkl_avg": 94.45753479003906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5188368558883667, + "step": 832, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002236366271973 + }, + { + "episode": 13344, + "epoch": 0.23985332710213178, + "loss/policy_avg": 0.34776580333709717, + "lr": 2.8402990797546016e-06, + "objective/entropy": 3.6189041137695312, + "objective/kl": 14.347872734069824, + "objective/non_score_reward": -1.4347872734069824, + "objective/rlhf_reward": -5.339149034023285, + "objective/scores": 0.1, + "policy/approxkl_avg": 63.92158889770508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49274176359176636, + "step": 833, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999204158782959 + }, + { + "episode": 13360, + "epoch": 0.24014092101952045, + "loss/policy_avg": -0.2418670505285263, + "lr": 2.84010736196319e-06, + "objective/entropy": 307.7858581542969, + "objective/kl": 11.471115112304688, + "objective/non_score_reward": -1.1471115350723267, + "objective/rlhf_reward": -4.188446259498596, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.762351989746094, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7961260676383972, + "step": 834, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0257229804992676 + }, + { + "episode": 13376, + "epoch": 0.2404285149369091, + "loss/policy_avg": 0.4314028024673462, + "lr": 2.8399156441717792e-06, + "objective/entropy": 106.43098449707031, + "objective/kl": 12.59414291381836, + "objective/non_score_reward": -1.2594143152236938, + "objective/rlhf_reward": -0.6376570820808407, + "objective/scores": 1.1, + "policy/approxkl_avg": 49.60033416748047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6837184429168701, + "step": 835, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000203847885132 + }, + { + "episode": 13392, + "epoch": 0.24071610885429773, + "loss/policy_avg": -0.4717941880226135, + "lr": 2.839723926380368e-06, + "objective/entropy": 156.29014587402344, + "objective/kl": 13.116241455078125, + "objective/non_score_reward": -1.3116241693496704, + "objective/rlhf_reward": -3.690237252917841, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 150.56942749023438, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6304831504821777, + "step": 836, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002359390258789 + }, + { + "episode": 13408, + "epoch": 0.24100370277168637, + "loss/policy_avg": 0.48564639687538147, + "lr": 2.839532208588957e-06, + "objective/entropy": 25.292335510253906, + "objective/kl": 10.876035690307617, + "objective/non_score_reward": -1.0876035690307617, + "objective/rlhf_reward": -2.403002808766301, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 81.79934692382812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7601375579833984, + "step": 837, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0018410682678223 + }, + { + "episode": 13424, + "epoch": 0.24129129668907504, + "loss/policy_avg": 0.2227717936038971, + "lr": 2.839340490797546e-06, + "objective/entropy": 290.1324157714844, + "objective/kl": 8.968036651611328, + "objective/non_score_reward": -0.896803617477417, + "objective/rlhf_reward": -5.587214469909668, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.772152900695801, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6874889135360718, + "step": 838, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978574514389038 + }, + { + "episode": 13440, + "epoch": 0.24157889060646368, + "loss/policy_avg": 0.378933846950531, + "lr": 2.839148773006135e-06, + "objective/entropy": 83.87255096435547, + "objective/kl": 16.091838836669922, + "objective/non_score_reward": -1.6091837882995605, + "objective/rlhf_reward": -8.436735153198242, + "objective/scores": -0.5, + "policy/approxkl_avg": 171.8421630859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7673331499099731, + "step": 839, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988526105880737 + }, + { + "episode": 13456, + "epoch": 0.24186648452385232, + "loss/policy_avg": 0.05077691376209259, + "lr": 2.838957055214724e-06, + "objective/entropy": 201.8306884765625, + "objective/kl": 10.442170143127441, + "objective/non_score_reward": -1.0442171096801758, + "objective/rlhf_reward": -6.176868438720703, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.752803802490234, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7712172269821167, + "step": 840, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.1516284942626953 + }, + { + "episode": 13472, + "epoch": 0.24215407844124096, + "loss/policy_avg": 0.6697203516960144, + "lr": 2.838765337423313e-06, + "objective/entropy": -228.6722869873047, + "objective/kl": 12.05379867553711, + "objective/non_score_reward": -1.2053799629211426, + "objective/rlhf_reward": -4.421519672870636, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.305578231811523, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6277028918266296, + "step": 841, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998668909072876 + }, + { + "episode": 13488, + "epoch": 0.24244167235862962, + "loss/policy_avg": -0.059078969061374664, + "lr": 2.8385736196319018e-06, + "objective/entropy": 41.68458557128906, + "objective/kl": 9.514412879943848, + "objective/non_score_reward": -0.9514412879943848, + "objective/rlhf_reward": -3.4057652115821835, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.8853363990783691, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6527504324913025, + "step": 842, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000281810760498 + }, + { + "episode": 13504, + "epoch": 0.24272926627601826, + "loss/policy_avg": 0.03922227397561073, + "lr": 2.838381901840491e-06, + "objective/entropy": -126.3974838256836, + "objective/kl": 15.604471206665039, + "objective/non_score_reward": -1.560447096824646, + "objective/rlhf_reward": -5.841788208484649, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.71725082397461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5101525187492371, + "step": 843, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998733401298523 + }, + { + "episode": 13520, + "epoch": 0.2430168601934069, + "loss/policy_avg": -0.07494255900382996, + "lr": 2.83819018404908e-06, + "objective/entropy": -109.69844055175781, + "objective/kl": 12.058096885681152, + "objective/non_score_reward": -1.2058095932006836, + "objective/rlhf_reward": -4.423238492012024, + "objective/scores": 0.1, + "policy/approxkl_avg": 71.72029876708984, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6301126480102539, + "step": 844, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0005083084106445 + }, + { + "episode": 13536, + "epoch": 0.24330445411079557, + "loss/policy_avg": -0.0021638330072164536, + "lr": 2.8379984662576686e-06, + "objective/entropy": -55.14314651489258, + "objective/kl": 16.980009078979492, + "objective/non_score_reward": -1.6980011463165283, + "objective/rlhf_reward": -6.392004287242889, + "objective/scores": 0.1, + "policy/approxkl_avg": 58.57079315185547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5378575921058655, + "step": 845, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998762607574463 + }, + { + "episode": 13552, + "epoch": 0.2435920480281842, + "loss/policy_avg": 0.18776272237300873, + "lr": 2.837806748466258e-06, + "objective/entropy": 213.31375122070312, + "objective/kl": 13.382487297058105, + "objective/non_score_reward": -1.3382488489151, + "objective/rlhf_reward": -0.9529952764511105, + "objective/scores": 1.1, + "policy/approxkl_avg": 47.32990646362305, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5546428561210632, + "step": 846, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999443769454956 + }, + { + "episode": 13568, + "epoch": 0.24387964194557285, + "loss/policy_avg": -0.10050228238105774, + "lr": 2.8376150306748467e-06, + "objective/entropy": -1.5470504760742188, + "objective/kl": 5.421267509460449, + "objective/non_score_reward": -0.5421267151832581, + "objective/rlhf_reward": -1.768506808578968, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.883844375610352, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.690306544303894, + "step": 847, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0045018196105957 + }, + { + "episode": 13584, + "epoch": 0.2441672358629615, + "loss/policy_avg": 0.14597558975219727, + "lr": 2.837423312883436e-06, + "objective/entropy": -17.603618621826172, + "objective/kl": 11.631009101867676, + "objective/non_score_reward": -1.1631009578704834, + "objective/rlhf_reward": -4.252403473854065, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.45585250854492, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5793694257736206, + "step": 848, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997261643409729 + }, + { + "episode": 13600, + "epoch": 0.24445482978035016, + "loss/policy_avg": 0.3591553866863251, + "lr": 2.8372315950920247e-06, + "objective/entropy": 140.2003173828125, + "objective/kl": 16.13811492919922, + "objective/non_score_reward": -1.613811731338501, + "objective/rlhf_reward": -6.055246709287166, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.97541809082031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7342993021011353, + "step": 849, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992564916610718 + }, + { + "episode": 13616, + "epoch": 0.2447424236977388, + "loss/policy_avg": 0.30371835827827454, + "lr": 2.8370398773006135e-06, + "objective/entropy": 231.95297241210938, + "objective/kl": 17.194393157958984, + "objective/non_score_reward": -1.7194395065307617, + "objective/rlhf_reward": -4.755051466003929, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 10.407341003417969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5657638311386108, + "step": 850, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007481575012207 + }, + { + "episode": 13632, + "epoch": 0.24503001761512744, + "loss/policy_avg": 0.5038444399833679, + "lr": 2.8368481595092027e-06, + "objective/entropy": 60.03744125366211, + "objective/kl": 14.280261993408203, + "objective/non_score_reward": -1.4280261993408203, + "objective/rlhf_reward": -3.764693687634404, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 37.566680908203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611575186252594, + "step": 851, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986273050308228 + }, + { + "episode": 13648, + "epoch": 0.24531761153251608, + "loss/policy_avg": 0.25327011942863464, + "lr": 2.8366564417177915e-06, + "objective/entropy": 80.94923400878906, + "objective/kl": 13.566845893859863, + "objective/non_score_reward": -1.356684684753418, + "objective/rlhf_reward": -1.0267389029264447, + "objective/scores": 1.1, + "policy/approxkl_avg": 30.468521118164062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.743263840675354, + "step": 852, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002877712249756 + }, + { + "episode": 13664, + "epoch": 0.24560520544990475, + "loss/policy_avg": 0.7088375091552734, + "lr": 2.8364647239263804e-06, + "objective/entropy": -19.6234130859375, + "objective/kl": 18.899595260620117, + "objective/non_score_reward": -1.889959454536438, + "objective/rlhf_reward": -5.735009069713662, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 87.53837585449219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6533883213996887, + "step": 853, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973728656768799 + }, + { + "episode": 13680, + "epoch": 0.24589279936729339, + "loss/policy_avg": 0.21842418611049652, + "lr": 2.8362730061349696e-06, + "objective/entropy": -2.3630218505859375, + "objective/kl": 10.437822341918945, + "objective/non_score_reward": -1.043782353401184, + "objective/rlhf_reward": -1.775129473209381, + "objective/scores": 0.6, + "policy/approxkl_avg": 25.123273849487305, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4626666307449341, + "step": 854, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997999906539917 + }, + { + "episode": 13696, + "epoch": 0.24618039328468203, + "loss/policy_avg": 0.30842268466949463, + "lr": 2.8360812883435584e-06, + "objective/entropy": 73.10386657714844, + "objective/kl": 14.02588176727295, + "objective/non_score_reward": -1.402587890625, + "objective/rlhf_reward": -7.6103515625, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.31865692138672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47557830810546875, + "step": 855, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000145435333252 + }, + { + "episode": 13712, + "epoch": 0.24646798720207067, + "loss/policy_avg": 0.1986391693353653, + "lr": 2.835889570552147e-06, + "objective/entropy": 41.59130096435547, + "objective/kl": 11.414254188537598, + "objective/non_score_reward": -1.141425371170044, + "objective/rlhf_reward": -6.565701484680176, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.20241165161133, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5653365850448608, + "step": 856, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001561164855957 + }, + { + "episode": 13728, + "epoch": 0.24675558111945933, + "loss/policy_avg": -0.09086053818464279, + "lr": 2.835697852760736e-06, + "objective/entropy": 129.108154296875, + "objective/kl": 11.600625038146973, + "objective/non_score_reward": -1.160062551498413, + "objective/rlhf_reward": -4.240250265598297, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.625953674316406, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8041683435440063, + "step": 857, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001258373260498 + }, + { + "episode": 13744, + "epoch": 0.24704317503684797, + "loss/policy_avg": 0.4292464852333069, + "lr": 2.8355061349693253e-06, + "objective/entropy": -7.69146728515625, + "objective/kl": 8.978507995605469, + "objective/non_score_reward": -0.8978508710861206, + "objective/rlhf_reward": -0.6676843806516852, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 29.219491958618164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43980222940444946, + "step": 858, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975333213806152 + }, + { + "episode": 13760, + "epoch": 0.2473307689542366, + "loss/policy_avg": -0.1740269660949707, + "lr": 2.835314417177914e-06, + "objective/entropy": 199.1434783935547, + "objective/kl": 10.360536575317383, + "objective/non_score_reward": -1.0360536575317383, + "objective/rlhf_reward": -3.74421471953392, + "objective/scores": 0.1, + "policy/approxkl_avg": 45.653465270996094, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.46243250370025635, + "step": 859, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001521110534668 + }, + { + "episode": 13776, + "epoch": 0.24761836287162525, + "loss/policy_avg": 0.11918877065181732, + "lr": 2.835122699386503e-06, + "objective/entropy": 72.46167755126953, + "objective/kl": 14.164287567138672, + "objective/non_score_reward": -1.416428565979004, + "objective/rlhf_reward": -7.665714263916016, + "objective/scores": -0.5, + "policy/approxkl_avg": 134.235107421875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4980497360229492, + "step": 860, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996706247329712 + }, + { + "episode": 13792, + "epoch": 0.24790595678901392, + "loss/policy_avg": 0.05426352471113205, + "lr": 2.834930981595092e-06, + "objective/entropy": 4.267814636230469, + "objective/kl": 14.633834838867188, + "objective/non_score_reward": -1.463383436203003, + "objective/rlhf_reward": -1.453533565998077, + "objective/scores": 1.1, + "policy/approxkl_avg": 25.47060203552246, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.566750168800354, + "step": 861, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998802900314331 + }, + { + "episode": 13808, + "epoch": 0.24819355070640256, + "loss/policy_avg": 0.5575066804885864, + "lr": 2.834739263803681e-06, + "objective/entropy": 206.50341796875, + "objective/kl": 13.636398315429688, + "objective/non_score_reward": -1.3636398315429688, + "objective/rlhf_reward": -1.0545593261718746, + "objective/scores": 1.1, + "policy/approxkl_avg": 26.75678062438965, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4932931065559387, + "step": 862, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983261823654175 + }, + { + "episode": 13824, + "epoch": 0.2484811446237912, + "loss/policy_avg": 0.23257222771644592, + "lr": 2.83454754601227e-06, + "objective/entropy": -39.31481170654297, + "objective/kl": 14.039112091064453, + "objective/non_score_reward": -1.4039111137390137, + "objective/rlhf_reward": -7.615644454956055, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.90751647949219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.643797755241394, + "step": 863, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972407817840576 + }, + { + "episode": 13840, + "epoch": 0.24876873854117984, + "loss/policy_avg": 0.02133660763502121, + "lr": 2.834355828220859e-06, + "objective/entropy": -178.11383056640625, + "objective/kl": 11.982830047607422, + "objective/non_score_reward": -1.1982829570770264, + "objective/rlhf_reward": -4.393131679296493, + "objective/scores": 0.1, + "policy/approxkl_avg": 53.20249938964844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7372720241546631, + "step": 864, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001306533813477 + }, + { + "episode": 13856, + "epoch": 0.2490563324585685, + "loss/policy_avg": 0.11362017691135406, + "lr": 2.8341641104294478e-06, + "objective/entropy": 79.2239990234375, + "objective/kl": 13.957067489624023, + "objective/non_score_reward": -1.3957067728042603, + "objective/rlhf_reward": -5.182827150821685, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.16444396972656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6697244048118591, + "step": 865, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998111367225647 + }, + { + "episode": 13872, + "epoch": 0.24934392637595715, + "loss/policy_avg": -0.15431474149227142, + "lr": 2.833972392638037e-06, + "objective/entropy": 85.38504028320312, + "objective/kl": 12.748974800109863, + "objective/non_score_reward": -1.274897575378418, + "objective/rlhf_reward": -4.699590167403221, + "objective/scores": 0.1, + "policy/approxkl_avg": 95.32827758789062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4564756155014038, + "step": 866, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008811950683594 + }, + { + "episode": 13888, + "epoch": 0.2496315202933458, + "loss/policy_avg": 0.5219398140907288, + "lr": 2.833780674846626e-06, + "objective/entropy": -68.0516128540039, + "objective/kl": 10.130060195922852, + "objective/non_score_reward": -1.0130060911178589, + "objective/rlhf_reward": -3.6520243942737576, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.153987884521484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.576066255569458, + "step": 867, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983216524124146 + }, + { + "episode": 13904, + "epoch": 0.24991911421073446, + "loss/policy_avg": -0.27920252084732056, + "lr": 2.8335889570552146e-06, + "objective/entropy": 37.78301239013672, + "objective/kl": 8.389419555664062, + "objective/non_score_reward": -0.8389419317245483, + "objective/rlhf_reward": -5.355767726898193, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.89885711669922, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.3526889383792877, + "step": 868, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0450096130371094 + }, + { + "episode": 13920, + "epoch": 0.25020670812812307, + "loss/policy_avg": 0.23919862508773804, + "lr": 2.833397239263804e-06, + "objective/entropy": 96.82152557373047, + "objective/kl": 14.15482234954834, + "objective/non_score_reward": -1.415482521057129, + "objective/rlhf_reward": -5.261929965019226, + "objective/scores": 0.1, + "policy/approxkl_avg": 81.41618347167969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7077710628509521, + "step": 869, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002362728118896 + }, + { + "episode": 13936, + "epoch": 0.25049430204551176, + "loss/policy_avg": -0.008724374696612358, + "lr": 2.8332055214723927e-06, + "objective/entropy": 5.678382873535156, + "objective/kl": 8.6428861618042, + "objective/non_score_reward": -0.8642886877059937, + "objective/rlhf_reward": -0.5334355577242103, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 18.883581161499023, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6166142225265503, + "step": 870, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976524114608765 + }, + { + "episode": 13952, + "epoch": 0.2507818959629004, + "loss/policy_avg": 0.5849788188934326, + "lr": 2.833013803680982e-06, + "objective/entropy": 7.5515899658203125, + "objective/kl": 12.30251693725586, + "objective/non_score_reward": -1.2302517890930176, + "objective/rlhf_reward": -3.259147563547479, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 45.453224182128906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8170486688613892, + "step": 871, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998389482498169 + }, + { + "episode": 13968, + "epoch": 0.25106948988028904, + "loss/policy_avg": 1.0887643098831177, + "lr": 2.8328220858895707e-06, + "objective/entropy": 19.560287475585938, + "objective/kl": 9.816727638244629, + "objective/non_score_reward": -0.9816729426383972, + "objective/rlhf_reward": -3.526691591739654, + "objective/scores": 0.1, + "policy/approxkl_avg": 59.10675048828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4642179012298584, + "step": 872, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975433349609375 + }, + { + "episode": 13984, + "epoch": 0.2513570837976777, + "loss/policy_avg": 0.27298757433891296, + "lr": 2.8326303680981595e-06, + "objective/entropy": -61.813533782958984, + "objective/kl": 14.266277313232422, + "objective/non_score_reward": -1.4266278743743896, + "objective/rlhf_reward": -3.3065116763114926, + "objective/scores": 0.6, + "policy/approxkl_avg": 34.11764907836914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49072712659835815, + "step": 873, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989274740219116 + }, + { + "episode": 14000, + "epoch": 0.2516446777150663, + "loss/policy_avg": 0.22000867128372192, + "lr": 2.8324386503067487e-06, + "objective/entropy": -82.48310089111328, + "objective/kl": 12.360732078552246, + "objective/non_score_reward": -1.2360732555389404, + "objective/rlhf_reward": -6.944293022155762, + "objective/scores": -0.5, + "policy/approxkl_avg": 76.64207458496094, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4945850372314453, + "step": 874, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999876856803894 + }, + { + "episode": 14016, + "epoch": 0.25193227163245496, + "loss/policy_avg": 0.2379000037908554, + "lr": 2.8322469325153376e-06, + "objective/entropy": 11.343524932861328, + "objective/kl": 13.053705215454102, + "objective/non_score_reward": -1.305370569229126, + "objective/rlhf_reward": -7.221482276916504, + "objective/scores": -0.5, + "policy/approxkl_avg": 109.89395141601562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5588850975036621, + "step": 875, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9999418258666992 + }, + { + "episode": 14032, + "epoch": 0.2522198655498436, + "loss/policy_avg": 0.48280632495880127, + "lr": 2.8320552147239268e-06, + "objective/entropy": 169.83905029296875, + "objective/kl": 12.475770950317383, + "objective/non_score_reward": -1.2475769519805908, + "objective/rlhf_reward": -3.042896787600453, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 133.74017333984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.40729820728302, + "step": 876, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9965707063674927 + }, + { + "episode": 14048, + "epoch": 0.25250745946723224, + "loss/policy_avg": 0.5907744765281677, + "lr": 2.8318634969325156e-06, + "objective/entropy": 206.38848876953125, + "objective/kl": 10.661521911621094, + "objective/non_score_reward": -1.0661522150039673, + "objective/rlhf_reward": -3.8646090537309643, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.813899993896484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.71628737449646, + "step": 877, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990339279174805 + }, + { + "episode": 14064, + "epoch": 0.25279505338462094, + "loss/policy_avg": 0.2785950005054474, + "lr": 2.8316717791411044e-06, + "objective/entropy": 162.6572723388672, + "objective/kl": 12.341489791870117, + "objective/non_score_reward": -1.2341489791870117, + "objective/rlhf_reward": -2.012877260090086, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 85.18682861328125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8230720162391663, + "step": 878, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007762908935547 + }, + { + "episode": 14080, + "epoch": 0.2530826473020096, + "loss/policy_avg": 0.5439757108688354, + "lr": 2.8314800613496932e-06, + "objective/entropy": -29.994464874267578, + "objective/kl": 9.197661399841309, + "objective/non_score_reward": -0.9197661876678467, + "objective/rlhf_reward": -3.279064661264419, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.833585739135742, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.591437816619873, + "step": 879, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997772216796875 + }, + { + "episode": 14096, + "epoch": 0.2533702412193982, + "loss/policy_avg": 0.41080114245414734, + "lr": 2.831288343558282e-06, + "objective/entropy": 239.91580200195312, + "objective/kl": 9.134315490722656, + "objective/non_score_reward": -0.9134315252304077, + "objective/rlhf_reward": -5.653726100921631, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.719074249267578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5987927913665771, + "step": 880, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004446506500244 + }, + { + "episode": 14112, + "epoch": 0.25365783513678686, + "loss/policy_avg": 0.6375956535339355, + "lr": 2.8310966257668713e-06, + "objective/entropy": -158.66209411621094, + "objective/kl": 10.0148344039917, + "objective/non_score_reward": -1.0014833211898804, + "objective/rlhf_reward": -1.082214523793432, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 21.38302230834961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6316667795181274, + "step": 881, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006909370422363 + }, + { + "episode": 14128, + "epoch": 0.2539454290541755, + "loss/policy_avg": 0.45256638526916504, + "lr": 2.83090490797546e-06, + "objective/entropy": 207.23558044433594, + "objective/kl": 18.251401901245117, + "objective/non_score_reward": -1.8251402378082275, + "objective/rlhf_reward": -4.90056095123291, + "objective/scores": 0.6, + "policy/approxkl_avg": 26.723499298095703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5192215442657471, + "step": 882, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000243663787842 + }, + { + "episode": 14144, + "epoch": 0.25423302297156414, + "loss/policy_avg": -0.07051656395196915, + "lr": 2.830713190184049e-06, + "objective/entropy": 99.65924072265625, + "objective/kl": 11.526399612426758, + "objective/non_score_reward": -1.1526398658752441, + "objective/rlhf_reward": -4.2105597615242, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.548530101776123, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6270474195480347, + "step": 883, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0014915466308594 + }, + { + "episode": 14160, + "epoch": 0.2545206168889528, + "loss/policy_avg": 0.15961593389511108, + "lr": 2.830521472392638e-06, + "objective/entropy": -13.243667602539062, + "objective/kl": 14.339917182922363, + "objective/non_score_reward": -1.43399178981781, + "objective/rlhf_reward": -1.3359672188758847, + "objective/scores": 1.1, + "policy/approxkl_avg": 100.59516906738281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.570610523223877, + "step": 884, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982887506484985 + }, + { + "episode": 14176, + "epoch": 0.2548082108063415, + "loss/policy_avg": 0.2716251015663147, + "lr": 2.830329754601227e-06, + "objective/entropy": 43.207088470458984, + "objective/kl": 15.224469184875488, + "objective/non_score_reward": -1.522447109222412, + "objective/rlhf_reward": -1.6897883176803585, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.94150161743164, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.45153629779815674, + "step": 885, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001307725906372 + }, + { + "episode": 14192, + "epoch": 0.2550958047237301, + "loss/policy_avg": 0.19296292960643768, + "lr": 2.830138036809816e-06, + "objective/entropy": -110.1358642578125, + "objective/kl": 8.883868217468262, + "objective/non_score_reward": -0.8883869051933289, + "objective/rlhf_reward": -5.553547382354736, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.389582633972168, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.713127613067627, + "step": 886, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0009400844573975 + }, + { + "episode": 14208, + "epoch": 0.25538339864111875, + "loss/policy_avg": -0.08597195148468018, + "lr": 2.829946319018405e-06, + "objective/entropy": 135.94149780273438, + "objective/kl": 12.527217864990234, + "objective/non_score_reward": -1.2527216672897339, + "objective/rlhf_reward": -0.6108868777751919, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.5981717109680176, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6131302714347839, + "step": 887, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0009326934814453 + }, + { + "episode": 14224, + "epoch": 0.2556709925585074, + "loss/policy_avg": 0.06909796595573425, + "lr": 2.8297546012269938e-06, + "objective/entropy": 86.01097869873047, + "objective/kl": 10.155022621154785, + "objective/non_score_reward": -1.0155022144317627, + "objective/rlhf_reward": -1.1382899030458655, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 16.276836395263672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6549547910690308, + "step": 888, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995278120040894 + }, + { + "episode": 14240, + "epoch": 0.25595858647589603, + "loss/policy_avg": 0.1944853961467743, + "lr": 2.829562883435583e-06, + "objective/entropy": 23.008251190185547, + "objective/kl": 10.702659606933594, + "objective/non_score_reward": -1.0702658891677856, + "objective/rlhf_reward": -3.88106365352869, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.89102935791016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.557334840297699, + "step": 889, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0014572143554688 + }, + { + "episode": 14256, + "epoch": 0.2562461803932847, + "loss/policy_avg": 0.18296848237514496, + "lr": 2.829371165644172e-06, + "objective/entropy": 236.90658569335938, + "objective/kl": 17.1822566986084, + "objective/non_score_reward": -1.7182257175445557, + "objective/rlhf_reward": -6.472903227806091, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.678916931152344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6069942116737366, + "step": 890, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001883506774902 + }, + { + "episode": 14272, + "epoch": 0.2565337743106733, + "loss/policy_avg": 0.04921949282288551, + "lr": 2.829179447852761e-06, + "objective/entropy": 108.23074340820312, + "objective/kl": 9.07606315612793, + "objective/non_score_reward": -0.907606303691864, + "objective/rlhf_reward": 0.7695747852325443, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.1709768772125244, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6781247854232788, + "step": 891, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007190704345703 + }, + { + "episode": 14288, + "epoch": 0.25682136822806195, + "loss/policy_avg": 0.263433575630188, + "lr": 2.82898773006135e-06, + "objective/entropy": -11.065872192382812, + "objective/kl": 15.533244132995605, + "objective/non_score_reward": -1.5533244609832764, + "objective/rlhf_reward": -4.090591879860435, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 90.50593566894531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6565124988555908, + "step": 892, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996107816696167 + }, + { + "episode": 14304, + "epoch": 0.25710896214545065, + "loss/policy_avg": 0.03790378198027611, + "lr": 2.8287960122699387e-06, + "objective/entropy": -195.91549682617188, + "objective/kl": 4.9871110916137695, + "objective/non_score_reward": -0.49871110916137695, + "objective/rlhf_reward": -0.047433326916630936, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.2991867065429688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5137837529182434, + "step": 893, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9998875856399536 + }, + { + "episode": 14320, + "epoch": 0.2573965560628393, + "loss/policy_avg": 0.16121526062488556, + "lr": 2.828604294478528e-06, + "objective/entropy": -126.01262664794922, + "objective/kl": 13.482461929321289, + "objective/non_score_reward": -1.3482462167739868, + "objective/rlhf_reward": -3.5681560590592136, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 6.479033470153809, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5413044691085815, + "step": 894, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979188442230225 + }, + { + "episode": 14336, + "epoch": 0.25768414998022793, + "loss/policy_avg": 0.03230078145861626, + "lr": 2.8284125766871167e-06, + "objective/entropy": -3.0018844604492188, + "objective/kl": 11.91611385345459, + "objective/non_score_reward": -1.191611409187317, + "objective/rlhf_reward": -4.366445696353912, + "objective/scores": 0.1, + "policy/approxkl_avg": 78.21747589111328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6705281734466553, + "step": 895, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984080791473389 + }, + { + "episode": 14352, + "epoch": 0.25797174389761657, + "loss/policy_avg": 0.3060767948627472, + "lr": 2.8282208588957055e-06, + "objective/entropy": 61.28870391845703, + "objective/kl": 8.302905082702637, + "objective/non_score_reward": -0.8302905559539795, + "objective/rlhf_reward": -2.921162268519401, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.191162586212158, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.553315281867981, + "step": 896, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997650384902954 + }, + { + "episode": 14368, + "epoch": 0.2582593378150052, + "loss/policy_avg": 0.17983271181583405, + "lr": 2.8280291411042947e-06, + "objective/entropy": -20.331504821777344, + "objective/kl": 17.491222381591797, + "objective/non_score_reward": -1.7491222620010376, + "objective/rlhf_reward": -8.996489524841309, + "objective/scores": -0.5, + "policy/approxkl_avg": 119.10844421386719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5886894464492798, + "step": 897, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989356994628906 + }, + { + "episode": 14384, + "epoch": 0.25854693173239385, + "loss/policy_avg": 0.1394512951374054, + "lr": 2.8278374233128836e-06, + "objective/entropy": 127.6997299194336, + "objective/kl": 15.690263748168945, + "objective/non_score_reward": -1.5690264701843262, + "objective/rlhf_reward": -8.276105880737305, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.133121490478516, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6130183935165405, + "step": 898, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997514247894287 + }, + { + "episode": 14400, + "epoch": 0.2588345256497825, + "loss/policy_avg": 0.2098160684108734, + "lr": 2.827645705521473e-06, + "objective/entropy": -21.335540771484375, + "objective/kl": 10.539255142211914, + "objective/non_score_reward": -1.0539255142211914, + "objective/rlhf_reward": -3.815702205896377, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.069839477539062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7244166731834412, + "step": 899, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9994945526123047 + }, + { + "episode": 14416, + "epoch": 0.2591221195671711, + "loss/policy_avg": 0.642552375793457, + "lr": 2.8274539877300616e-06, + "objective/entropy": -27.118850708007812, + "objective/kl": 15.130813598632812, + "objective/non_score_reward": -1.5130811929702759, + "objective/rlhf_reward": -3.652324831485748, + "objective/scores": 0.6, + "policy/approxkl_avg": 107.5416259765625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6859394311904907, + "step": 900, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00144100189209 + }, + { + "episode": 14432, + "epoch": 0.2594097134845598, + "loss/policy_avg": 0.5634386539459229, + "lr": 2.8272622699386504e-06, + "objective/entropy": -81.20954895019531, + "objective/kl": 9.424870491027832, + "objective/non_score_reward": -0.9424870610237122, + "objective/rlhf_reward": -1.647241981998954, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 161.22662353515625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49134987592697144, + "step": 901, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995436668395996 + }, + { + "episode": 14448, + "epoch": 0.25969730740194846, + "loss/policy_avg": 0.12854906916618347, + "lr": 2.8270705521472392e-06, + "objective/entropy": 49.876365661621094, + "objective/kl": 13.830504417419434, + "objective/non_score_reward": -1.383050560951233, + "objective/rlhf_reward": -7.532202243804932, + "objective/scores": -0.5, + "policy/approxkl_avg": 109.16975402832031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6257975697517395, + "step": 902, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994562864303589 + }, + { + "episode": 14464, + "epoch": 0.2599849013193371, + "loss/policy_avg": -0.2561631202697754, + "lr": 2.826878834355828e-06, + "objective/entropy": -208.84307861328125, + "objective/kl": 16.52663803100586, + "objective/non_score_reward": -1.6526635885238647, + "objective/rlhf_reward": -4.210654458403587, + "objective/scores": 0.6, + "policy/approxkl_avg": 27.665565490722656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7057524919509888, + "step": 903, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998441457748413 + }, + { + "episode": 14480, + "epoch": 0.26027249523672574, + "loss/policy_avg": 0.011652922257781029, + "lr": 2.8266871165644173e-06, + "objective/entropy": 39.806941986083984, + "objective/kl": 8.505756378173828, + "objective/non_score_reward": -0.8505756855010986, + "objective/rlhf_reward": -3.0023026227951046, + "objective/scores": 0.1, + "policy/approxkl_avg": 97.56808471679688, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49984896183013916, + "step": 904, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997618198394775 + }, + { + "episode": 14496, + "epoch": 0.2605600891541144, + "loss/policy_avg": 0.3299306035041809, + "lr": 2.826495398773006e-06, + "objective/entropy": 186.02059936523438, + "objective/kl": 12.209084510803223, + "objective/non_score_reward": -1.2209084033966064, + "objective/rlhf_reward": -2.760927232281242, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 42.59535217285156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8104973435401917, + "step": 905, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011796951293945 + }, + { + "episode": 14512, + "epoch": 0.260847683071503, + "loss/policy_avg": 0.4473887085914612, + "lr": 2.826303680981595e-06, + "objective/entropy": 236.0746307373047, + "objective/kl": 15.661829948425293, + "objective/non_score_reward": -1.5661829710006714, + "objective/rlhf_reward": -8.264732360839844, + "objective/scores": -0.5, + "policy/approxkl_avg": 174.1883087158203, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6012299656867981, + "step": 906, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999688982963562 + }, + { + "episode": 14528, + "epoch": 0.26113527698889166, + "loss/policy_avg": 0.26635417342185974, + "lr": 2.826111963190184e-06, + "objective/entropy": -117.22915649414062, + "objective/kl": 19.672523498535156, + "objective/non_score_reward": -1.967252492904663, + "objective/rlhf_reward": -9.869009971618652, + "objective/scores": -0.5, + "policy/approxkl_avg": 142.4681396484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6905121803283691, + "step": 907, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998607873916626 + }, + { + "episode": 14544, + "epoch": 0.26142287090628036, + "loss/policy_avg": 0.463941752910614, + "lr": 2.825920245398773e-06, + "objective/entropy": 193.2610626220703, + "objective/kl": 11.647161483764648, + "objective/non_score_reward": -1.1647162437438965, + "objective/rlhf_reward": -2.2588648259639736, + "objective/scores": 0.6, + "policy/approxkl_avg": 40.81477355957031, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6915194988250732, + "step": 908, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976000785827637 + }, + { + "episode": 14560, + "epoch": 0.261710464823669, + "loss/policy_avg": 0.4663980007171631, + "lr": 2.825728527607362e-06, + "objective/entropy": -9.778663635253906, + "objective/kl": 11.958295822143555, + "objective/non_score_reward": -1.1958296298980713, + "objective/rlhf_reward": -1.8595997437250344, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.867328643798828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.38230326771736145, + "step": 909, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996697902679443 + }, + { + "episode": 14576, + "epoch": 0.26199805874105764, + "loss/policy_avg": 0.21209433674812317, + "lr": 2.825536809815951e-06, + "objective/entropy": 162.04637145996094, + "objective/kl": 15.277240753173828, + "objective/non_score_reward": -1.527724266052246, + "objective/rlhf_reward": -5.710896825790405, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.56328010559082, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7583177089691162, + "step": 910, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000117063522339 + }, + { + "episode": 14592, + "epoch": 0.2622856526584463, + "loss/policy_avg": 0.6738499402999878, + "lr": 2.8253450920245398e-06, + "objective/entropy": 169.78570556640625, + "objective/kl": 12.763500213623047, + "objective/non_score_reward": -1.2763500213623047, + "objective/rlhf_reward": -4.705400294065475, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.89369201660156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.712113082408905, + "step": 911, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986307621002197 + }, + { + "episode": 14608, + "epoch": 0.2625732465758349, + "loss/policy_avg": -0.1670105904340744, + "lr": 2.825153374233129e-06, + "objective/entropy": 129.38778686523438, + "objective/kl": 8.210895538330078, + "objective/non_score_reward": -0.8210896253585815, + "objective/rlhf_reward": -5.284358501434326, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.988738536834717, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5074647665023804, + "step": 912, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003474473953247 + }, + { + "episode": 14624, + "epoch": 0.26286084049322356, + "loss/policy_avg": 0.8385502099990845, + "lr": 2.824961656441718e-06, + "objective/entropy": 2.270915985107422, + "objective/kl": 16.01020622253418, + "objective/non_score_reward": -1.6010206937789917, + "objective/rlhf_reward": -6.004082834720611, + "objective/scores": 0.1, + "policy/approxkl_avg": 122.61514282226562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6589502692222595, + "step": 913, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001535415649414 + }, + { + "episode": 14640, + "epoch": 0.2631484344106122, + "loss/policy_avg": 0.5200643539428711, + "lr": 2.824769938650307e-06, + "objective/entropy": 41.12443542480469, + "objective/kl": 14.1666259765625, + "objective/non_score_reward": -1.4166628122329712, + "objective/rlhf_reward": -1.2666512787342068, + "objective/scores": 1.1, + "policy/approxkl_avg": 110.20585632324219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7196406126022339, + "step": 914, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971590042114258 + }, + { + "episode": 14656, + "epoch": 0.26343602832800084, + "loss/policy_avg": 0.056996263563632965, + "lr": 2.824578220858896e-06, + "objective/entropy": 147.31983947753906, + "objective/kl": 7.540309906005859, + "objective/non_score_reward": -0.7540310025215149, + "objective/rlhf_reward": -5.0161237716674805, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.796720504760742, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5611803531646729, + "step": 915, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968600273132324 + }, + { + "episode": 14672, + "epoch": 0.26372362224538953, + "loss/policy_avg": 0.24829980731010437, + "lr": 2.8243865030674847e-06, + "objective/entropy": 31.06182861328125, + "objective/kl": 10.875221252441406, + "objective/non_score_reward": -1.0875221490859985, + "objective/rlhf_reward": -2.227382304445777, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.782899379730225, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.635535717010498, + "step": 916, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9965479373931885 + }, + { + "episode": 14688, + "epoch": 0.2640112161627782, + "loss/policy_avg": 0.06934709846973419, + "lr": 2.824194785276074e-06, + "objective/entropy": 142.6950225830078, + "objective/kl": 6.470335960388184, + "objective/non_score_reward": -0.6470335721969604, + "objective/rlhf_reward": 0.3355845019805703, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.148372650146484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6092413663864136, + "step": 917, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998875617980957 + }, + { + "episode": 14704, + "epoch": 0.2642988100801668, + "loss/policy_avg": 0.20597806572914124, + "lr": 2.8240030674846627e-06, + "objective/entropy": 272.4873046875, + "objective/kl": 14.78536319732666, + "objective/non_score_reward": -1.4785361289978027, + "objective/rlhf_reward": -7.914144992828369, + "objective/scores": -0.5, + "policy/approxkl_avg": 106.7894287109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7212300300598145, + "step": 918, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0013232231140137 + }, + { + "episode": 14720, + "epoch": 0.26458640399755545, + "loss/policy_avg": -0.0837036743760109, + "lr": 2.8238113496932515e-06, + "objective/entropy": 48.787288665771484, + "objective/kl": 13.830099105834961, + "objective/non_score_reward": -1.383009910583496, + "objective/rlhf_reward": -7.532039165496826, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.889322280883789, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6313380599021912, + "step": 919, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004570484161377 + }, + { + "episode": 14736, + "epoch": 0.2648739979149441, + "loss/policy_avg": 0.5091476440429688, + "lr": 2.8236196319018408e-06, + "objective/entropy": 169.26397705078125, + "objective/kl": 15.42192554473877, + "objective/non_score_reward": -1.5421926975250244, + "objective/rlhf_reward": -5.768770357966423, + "objective/scores": 0.1, + "policy/approxkl_avg": 256.7361145019531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6787434816360474, + "step": 920, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991068840026855 + }, + { + "episode": 14752, + "epoch": 0.26516159183233273, + "loss/policy_avg": 0.153235524892807, + "lr": 2.8234279141104296e-06, + "objective/entropy": -151.74813842773438, + "objective/kl": 8.891946792602539, + "objective/non_score_reward": -0.8891947269439697, + "objective/rlhf_reward": 0.8432209208607677, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.438941955566406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6721331477165222, + "step": 921, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993393421173096 + }, + { + "episode": 14768, + "epoch": 0.2654491857497214, + "loss/policy_avg": 0.1505753993988037, + "lr": 2.823236196319019e-06, + "objective/entropy": 103.85417175292969, + "objective/kl": 9.798246383666992, + "objective/non_score_reward": -0.9798246026039124, + "objective/rlhf_reward": -2.363039045539453, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 3.5156641006469727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5437833070755005, + "step": 922, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987695217132568 + }, + { + "episode": 14784, + "epoch": 0.26573677966711007, + "loss/policy_avg": 0.031007010489702225, + "lr": 2.823044478527607e-06, + "objective/entropy": -236.375732421875, + "objective/kl": 14.377126693725586, + "objective/non_score_reward": -1.4377126693725586, + "objective/rlhf_reward": -5.350850439071655, + "objective/scores": 0.1, + "policy/approxkl_avg": 107.49105072021484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7668004631996155, + "step": 923, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992296695709229 + }, + { + "episode": 14800, + "epoch": 0.2660243735844987, + "loss/policy_avg": 1.054640769958496, + "lr": 2.8228527607361964e-06, + "objective/entropy": 308.00750732421875, + "objective/kl": 16.157392501831055, + "objective/non_score_reward": -1.615739345550537, + "objective/rlhf_reward": -3.539238218904707, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 54.42896270751953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7293402552604675, + "step": 924, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982367753982544 + }, + { + "episode": 14816, + "epoch": 0.26631196750188735, + "loss/policy_avg": 0.8975625038146973, + "lr": 2.8226610429447852e-06, + "objective/entropy": 24.863666534423828, + "objective/kl": 9.843988418579102, + "objective/non_score_reward": -0.9843988418579102, + "objective/rlhf_reward": -3.5375953972339627, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.406509399414062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.34417903423309326, + "step": 925, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979350566864014 + }, + { + "episode": 14832, + "epoch": 0.266599561419276, + "loss/policy_avg": 0.7326955795288086, + "lr": 2.822469325153374e-06, + "objective/entropy": -177.4724578857422, + "objective/kl": 18.637279510498047, + "objective/non_score_reward": -1.8637280464172363, + "objective/rlhf_reward": -5.850791875187474, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 165.49850463867188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6023860573768616, + "step": 926, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989757537841797 + }, + { + "episode": 14848, + "epoch": 0.2668871553366646, + "loss/policy_avg": 0.18156574666500092, + "lr": 2.8222776073619633e-06, + "objective/entropy": 98.12123107910156, + "objective/kl": 16.134002685546875, + "objective/non_score_reward": -1.6134004592895508, + "objective/rlhf_reward": -8.453601837158203, + "objective/scores": -0.5, + "policy/approxkl_avg": 92.71282958984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4930706322193146, + "step": 927, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999679446220398 + }, + { + "episode": 14864, + "epoch": 0.26717474925405327, + "loss/policy_avg": 0.6215415596961975, + "lr": 2.822085889570552e-06, + "objective/entropy": -99.99117279052734, + "objective/kl": 6.985932350158691, + "objective/non_score_reward": -0.6985931396484375, + "objective/rlhf_reward": 1.6056274041533474, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.1479220390319824, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5162912011146545, + "step": 928, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0008466243743896 + }, + { + "episode": 14880, + "epoch": 0.2674623431714419, + "loss/policy_avg": 0.305203914642334, + "lr": 2.8218941717791413e-06, + "objective/entropy": 52.752532958984375, + "objective/kl": 16.012250900268555, + "objective/non_score_reward": -1.6012248992919922, + "objective/rlhf_reward": -6.0048998355865475, + "objective/scores": 0.1, + "policy/approxkl_avg": 33.087364196777344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8244567513465881, + "step": 929, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9968031644821167 + }, + { + "episode": 14896, + "epoch": 0.26774993708883055, + "loss/policy_avg": 0.3882259726524353, + "lr": 2.82170245398773e-06, + "objective/entropy": 170.99806213378906, + "objective/kl": 11.023755073547363, + "objective/non_score_reward": -1.1023752689361572, + "objective/rlhf_reward": -0.009501358866691234, + "objective/scores": 1.1, + "policy/approxkl_avg": 47.82780456542969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.68819260597229, + "step": 930, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000180721282959 + }, + { + "episode": 14912, + "epoch": 0.26803753100621924, + "loss/policy_avg": 0.3398761451244354, + "lr": 2.821510736196319e-06, + "objective/entropy": -18.394920349121094, + "objective/kl": 10.919998168945312, + "objective/non_score_reward": -1.0919996500015259, + "objective/rlhf_reward": -1.9679986894130705, + "objective/scores": 0.6, + "policy/approxkl_avg": 50.97767639160156, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.736298680305481, + "step": 931, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992265701293945 + }, + { + "episode": 14928, + "epoch": 0.2683251249236079, + "loss/policy_avg": -0.4279845058917999, + "lr": 2.821319018404908e-06, + "objective/entropy": 86.6572494506836, + "objective/kl": 7.83455228805542, + "objective/non_score_reward": -0.7834553718566895, + "objective/rlhf_reward": -1.4719620398884876, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 18.167985916137695, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.49258583784103394, + "step": 932, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.011922836303711 + }, + { + "episode": 14944, + "epoch": 0.2686127188409965, + "loss/policy_avg": 0.23957113921642303, + "lr": 2.821127300613497e-06, + "objective/entropy": 121.09419250488281, + "objective/kl": 11.997109413146973, + "objective/non_score_reward": -1.1997110843658447, + "objective/rlhf_reward": -6.798844337463379, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.186599731445312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5409206748008728, + "step": 933, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.995774269104004 + }, + { + "episode": 14960, + "epoch": 0.26890031275838516, + "loss/policy_avg": 0.44504815340042114, + "lr": 2.8209355828220858e-06, + "objective/entropy": 85.28959655761719, + "objective/kl": 9.80569076538086, + "objective/non_score_reward": -0.9805691242218018, + "objective/rlhf_reward": 0.47772344350814855, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.007376670837402, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6362345218658447, + "step": 934, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998863697052002 + }, + { + "episode": 14976, + "epoch": 0.2691879066757738, + "loss/policy_avg": 0.06488144397735596, + "lr": 2.820743865030675e-06, + "objective/entropy": -71.18653869628906, + "objective/kl": 11.959724426269531, + "objective/non_score_reward": -1.1959723234176636, + "objective/rlhf_reward": -6.783889293670654, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.0035400390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6700109243392944, + "step": 935, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0012855529785156 + }, + { + "episode": 14992, + "epoch": 0.26947550059316244, + "loss/policy_avg": 0.00518820621073246, + "lr": 2.820552147239264e-06, + "objective/entropy": 10.628082275390625, + "objective/kl": 19.038652420043945, + "objective/non_score_reward": -1.9038654565811157, + "objective/rlhf_reward": -4.691742573620054, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 43.91889190673828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9487606287002563, + "step": 936, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997607707977295 + }, + { + "episode": 15008, + "epoch": 0.2697630945105511, + "loss/policy_avg": 0.1454022079706192, + "lr": 2.820360429447853e-06, + "objective/entropy": 179.77040100097656, + "objective/kl": 12.530265808105469, + "objective/non_score_reward": -1.2530266046524048, + "objective/rlhf_reward": -4.612106418609619, + "objective/scores": 0.1, + "policy/approxkl_avg": 82.92576599121094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6685892343521118, + "step": 937, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975636005401611 + }, + { + "episode": 15024, + "epoch": 0.2700506884279397, + "loss/policy_avg": 0.2995688319206238, + "lr": 2.820168711656442e-06, + "objective/entropy": 133.7378692626953, + "objective/kl": 10.791794776916504, + "objective/non_score_reward": -1.0791795253753662, + "objective/rlhf_reward": -1.9167180418968202, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.06595230102539, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4817456007003784, + "step": 938, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000180721282959 + }, + { + "episode": 15040, + "epoch": 0.2703382823453284, + "loss/policy_avg": 0.34660422801971436, + "lr": 2.8199769938650307e-06, + "objective/entropy": 19.058616638183594, + "objective/kl": 14.038864135742188, + "objective/non_score_reward": -1.4038866758346558, + "objective/rlhf_reward": -1.215546733140945, + "objective/scores": 1.1, + "policy/approxkl_avg": 48.796592712402344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41182541847229004, + "step": 939, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997514009475708 + }, + { + "episode": 15056, + "epoch": 0.27062587626271706, + "loss/policy_avg": 0.2968835234642029, + "lr": 2.81978527607362e-06, + "objective/entropy": 76.23985290527344, + "objective/kl": 13.028081893920898, + "objective/non_score_reward": -1.3028082847595215, + "objective/rlhf_reward": -4.811233407258987, + "objective/scores": 0.1, + "policy/approxkl_avg": 115.7067642211914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6411737203598022, + "step": 940, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997145414352417 + }, + { + "episode": 15072, + "epoch": 0.2709134701801057, + "loss/policy_avg": 0.38955211639404297, + "lr": 2.8195935582822087e-06, + "objective/entropy": -10.788116455078125, + "objective/kl": 14.27804946899414, + "objective/non_score_reward": -1.4278050661087036, + "objective/rlhf_reward": -1.3112202048301693, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.869421005249023, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6856441497802734, + "step": 941, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979138374328613 + }, + { + "episode": 15088, + "epoch": 0.27120106409749434, + "loss/policy_avg": -0.04374265670776367, + "lr": 2.819401840490798e-06, + "objective/entropy": 46.83854675292969, + "objective/kl": 8.191838264465332, + "objective/non_score_reward": -0.8191839456558228, + "objective/rlhf_reward": -2.8767357528209683, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.673277854919434, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4365406632423401, + "step": 942, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992234706878662 + }, + { + "episode": 15104, + "epoch": 0.271488658014883, + "loss/policy_avg": 0.019855128601193428, + "lr": 2.8192101226993868e-06, + "objective/entropy": 146.18865966796875, + "objective/kl": 11.691083908081055, + "objective/non_score_reward": -1.169108271598816, + "objective/rlhf_reward": -4.276433093845844, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.19375228881836, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.438481867313385, + "step": 943, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008602142333984 + }, + { + "episode": 15120, + "epoch": 0.2717762519322716, + "loss/policy_avg": -0.07478684931993484, + "lr": 2.8190184049079756e-06, + "objective/entropy": 203.56137084960938, + "objective/kl": 14.393468856811523, + "objective/non_score_reward": -1.4393467903137207, + "objective/rlhf_reward": -1.357387429475784, + "objective/scores": 1.1, + "policy/approxkl_avg": 32.7160758972168, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7971817255020142, + "step": 944, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00042462348938 + }, + { + "episode": 15136, + "epoch": 0.27206384584966026, + "loss/policy_avg": 0.2446022629737854, + "lr": 2.8188266871165644e-06, + "objective/entropy": 31.180438995361328, + "objective/kl": 16.74203109741211, + "objective/non_score_reward": -1.6742032766342163, + "objective/rlhf_reward": -8.696813583374023, + "objective/scores": -0.5, + "policy/approxkl_avg": 105.12753295898438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7867506742477417, + "step": 945, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9979796409606934 + }, + { + "episode": 15152, + "epoch": 0.27235143976704895, + "loss/policy_avg": 0.1629078984260559, + "lr": 2.818634969325153e-06, + "objective/entropy": -197.06622314453125, + "objective/kl": 13.794787406921387, + "objective/non_score_reward": -1.3794788122177124, + "objective/rlhf_reward": -7.51791524887085, + "objective/scores": -0.5, + "policy/approxkl_avg": 115.64588928222656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6810861825942993, + "step": 946, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9969407320022583 + }, + { + "episode": 15168, + "epoch": 0.2726390336844376, + "loss/policy_avg": -0.03024168312549591, + "lr": 2.8184432515337424e-06, + "objective/entropy": 149.4989776611328, + "objective/kl": 16.816232681274414, + "objective/non_score_reward": -1.6816232204437256, + "objective/rlhf_reward": -8.726492881774902, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.17455291748047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48948463797569275, + "step": 947, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001532554626465 + }, + { + "episode": 15184, + "epoch": 0.27292662760182623, + "loss/policy_avg": 0.34567373991012573, + "lr": 2.8182515337423312e-06, + "objective/entropy": 90.28936767578125, + "objective/kl": 19.398473739624023, + "objective/non_score_reward": -1.939847469329834, + "objective/rlhf_reward": -7.359389877319336, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.141029357910156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7335373163223267, + "step": 948, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995067119598389 + }, + { + "episode": 15200, + "epoch": 0.2732142215192149, + "loss/policy_avg": 0.4489978551864624, + "lr": 2.81805981595092e-06, + "objective/entropy": 38.101829528808594, + "objective/kl": 14.68608283996582, + "objective/non_score_reward": -1.4686082601547241, + "objective/rlhf_reward": -7.8744330406188965, + "objective/scores": -0.5, + "policy/approxkl_avg": 115.49784088134766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5973610877990723, + "step": 949, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989502429962158 + }, + { + "episode": 15216, + "epoch": 0.2735018154366035, + "loss/policy_avg": 0.4765210449695587, + "lr": 2.8178680981595093e-06, + "objective/entropy": 234.66183471679688, + "objective/kl": 17.86334228515625, + "objective/non_score_reward": -1.7863342761993408, + "objective/rlhf_reward": -6.745336925983429, + "objective/scores": 0.1, + "policy/approxkl_avg": 46.356414794921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7054615616798401, + "step": 950, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985949993133545 + }, + { + "episode": 15232, + "epoch": 0.27378940935399215, + "loss/policy_avg": 0.0288299061357975, + "lr": 2.817676380368098e-06, + "objective/entropy": 156.8824462890625, + "objective/kl": 9.917464256286621, + "objective/non_score_reward": -0.9917463660240173, + "objective/rlhf_reward": -5.966985702514648, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.51423645019531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5643774271011353, + "step": 951, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000211238861084 + }, + { + "episode": 15248, + "epoch": 0.2740770032713808, + "loss/policy_avg": 0.11630159616470337, + "lr": 2.8174846625766873e-06, + "objective/entropy": 73.06771087646484, + "objective/kl": 13.975484848022461, + "objective/non_score_reward": -1.3975484371185303, + "objective/rlhf_reward": -5.190193688869476, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.41292190551758, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5209547281265259, + "step": 952, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003228187561035 + }, + { + "episode": 15264, + "epoch": 0.27436459718876943, + "loss/policy_avg": 0.4185836911201477, + "lr": 2.817292944785276e-06, + "objective/entropy": 172.16983032226562, + "objective/kl": 18.557403564453125, + "objective/non_score_reward": -1.8557404279708862, + "objective/rlhf_reward": -7.022961831092834, + "objective/scores": 0.1, + "policy/approxkl_avg": 90.51202392578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.751570463180542, + "step": 953, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9965605735778809 + }, + { + "episode": 15280, + "epoch": 0.27465219110615813, + "loss/policy_avg": -0.0021925121545791626, + "lr": 2.817101226993865e-06, + "objective/entropy": -46.05889892578125, + "objective/kl": 13.408881187438965, + "objective/non_score_reward": -1.3408881425857544, + "objective/rlhf_reward": -2.439833794475767, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 17.623706817626953, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7430280447006226, + "step": 954, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016512870788574 + }, + { + "episode": 15296, + "epoch": 0.27493978502354677, + "loss/policy_avg": -0.014723315834999084, + "lr": 2.816909509202454e-06, + "objective/entropy": 288.378173828125, + "objective/kl": 11.871005058288574, + "objective/non_score_reward": -1.1871004104614258, + "objective/rlhf_reward": -6.748402118682861, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.3176779747009277, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6996443271636963, + "step": 955, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000519037246704 + }, + { + "episode": 15312, + "epoch": 0.2752273789409354, + "loss/policy_avg": 0.6079083681106567, + "lr": 2.816717791411043e-06, + "objective/entropy": 31.44274139404297, + "objective/kl": 20.894535064697266, + "objective/non_score_reward": -2.08945369720459, + "objective/rlhf_reward": -7.957814311981202, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.899011611938477, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3588119149208069, + "step": 956, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980523586273193 + }, + { + "episode": 15328, + "epoch": 0.27551497285832405, + "loss/policy_avg": 0.41200828552246094, + "lr": 2.816526073619632e-06, + "objective/entropy": -4.771537780761719, + "objective/kl": 15.709589004516602, + "objective/non_score_reward": -1.5709590911865234, + "objective/rlhf_reward": -4.161129685417686, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 90.92428588867188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5645814538002014, + "step": 957, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.995368242263794 + }, + { + "episode": 15344, + "epoch": 0.2758025667757127, + "loss/policy_avg": 0.45684581995010376, + "lr": 2.816334355828221e-06, + "objective/entropy": -4.6809844970703125, + "objective/kl": 9.97734260559082, + "objective/non_score_reward": -0.9977341890335083, + "objective/rlhf_reward": -3.5909366667270657, + "objective/scores": 0.1, + "policy/approxkl_avg": 56.411354064941406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6887657642364502, + "step": 958, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.002199172973633 + }, + { + "episode": 15360, + "epoch": 0.2760901606931013, + "loss/policy_avg": -0.45410293340682983, + "lr": 2.81614263803681e-06, + "objective/entropy": -121.51223754882812, + "objective/kl": 7.737212657928467, + "objective/non_score_reward": -0.7737212777137756, + "objective/rlhf_reward": -1.2700562730160465, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 17.588157653808594, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4520346224308014, + "step": 959, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.006819725036621 + }, + { + "episode": 15376, + "epoch": 0.27637775461048997, + "loss/policy_avg": 0.30216163396835327, + "lr": 2.815950920245399e-06, + "objective/entropy": 151.243896484375, + "objective/kl": 17.64258575439453, + "objective/non_score_reward": -1.764258623123169, + "objective/rlhf_reward": -4.133315478206846, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 200.840087890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5193691253662109, + "step": 960, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9965643882751465 + }, + { + "episode": 15392, + "epoch": 0.27666534852787866, + "loss/policy_avg": 0.04060244560241699, + "lr": 2.815759202453988e-06, + "objective/entropy": 231.11839294433594, + "objective/kl": 14.362661361694336, + "objective/non_score_reward": -1.4362661838531494, + "objective/rlhf_reward": -5.345064407587051, + "objective/scores": 0.1, + "policy/approxkl_avg": 47.88323974609375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7988565564155579, + "step": 961, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993538856506348 + }, + { + "episode": 15408, + "epoch": 0.2769529424452673, + "loss/policy_avg": 0.3130953907966614, + "lr": 2.8155674846625767e-06, + "objective/entropy": 120.5262680053711, + "objective/kl": 15.489297866821289, + "objective/non_score_reward": -1.5489299297332764, + "objective/rlhf_reward": -5.795719510316848, + "objective/scores": 0.1, + "policy/approxkl_avg": 145.9391326904297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6065146923065186, + "step": 962, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990172386169434 + }, + { + "episode": 15424, + "epoch": 0.27724053636265594, + "loss/policy_avg": 0.4497438967227936, + "lr": 2.815375766871166e-06, + "objective/entropy": 103.47833251953125, + "objective/kl": 18.042163848876953, + "objective/non_score_reward": -1.8042165040969849, + "objective/rlhf_reward": -9.216865539550781, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.347259521484375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5049797296524048, + "step": 963, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005576610565186 + }, + { + "episode": 15440, + "epoch": 0.2775281302800446, + "loss/policy_avg": 0.35638368129730225, + "lr": 2.8151840490797547e-06, + "objective/entropy": 91.05720520019531, + "objective/kl": 11.770478248596191, + "objective/non_score_reward": -1.177047848701477, + "objective/rlhf_reward": -1.7844724997293677, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 71.71107482910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8685045838356018, + "step": 964, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999760627746582 + }, + { + "episode": 15456, + "epoch": 0.2778157241974332, + "loss/policy_avg": 0.463620662689209, + "lr": 2.814992331288344e-06, + "objective/entropy": 75.50821685791016, + "objective/kl": 18.123964309692383, + "objective/non_score_reward": -1.8123962879180908, + "objective/rlhf_reward": -5.693326204028681, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 125.98820495605469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6453250050544739, + "step": 965, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984214305877686 + }, + { + "episode": 15472, + "epoch": 0.27810331811482186, + "loss/policy_avg": 0.29372507333755493, + "lr": 2.8148006134969328e-06, + "objective/entropy": 249.2400360107422, + "objective/kl": 21.19771385192871, + "objective/non_score_reward": -2.1197714805603027, + "objective/rlhf_reward": -6.079085892438888, + "objective/scores": 0.6, + "policy/approxkl_avg": 248.2655487060547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8577245473861694, + "step": 966, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999601125717163 + }, + { + "episode": 15488, + "epoch": 0.2783909120322105, + "loss/policy_avg": 0.06916552782058716, + "lr": 2.8146088957055216e-06, + "objective/entropy": -68.20138549804688, + "objective/kl": 7.964428901672363, + "objective/non_score_reward": -0.7964429259300232, + "objective/rlhf_reward": -0.7857717037200929, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.7321834564208984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44984665513038635, + "step": 967, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999240756034851 + }, + { + "episode": 15504, + "epoch": 0.27867850594959914, + "loss/policy_avg": 0.2512040436267853, + "lr": 2.8144171779141104e-06, + "objective/entropy": 11.576576232910156, + "objective/kl": 7.048787593841553, + "objective/non_score_reward": -0.7048788070678711, + "objective/rlhf_reward": -0.9946863904324283, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 0.4532914459705353, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5414634943008423, + "step": 968, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003035545349121 + }, + { + "episode": 15520, + "epoch": 0.27896609986698784, + "loss/policy_avg": 0.6474967002868652, + "lr": 2.814225460122699e-06, + "objective/entropy": -189.89291381835938, + "objective/kl": 6.612698078155518, + "objective/non_score_reward": -0.6612698435783386, + "objective/rlhf_reward": -0.24507925510406492, + "objective/scores": 0.6, + "policy/approxkl_avg": 31.995384216308594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7192500233650208, + "step": 969, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001279830932617 + }, + { + "episode": 15536, + "epoch": 0.2792536937843765, + "loss/policy_avg": 0.03323252499103546, + "lr": 2.8140337423312884e-06, + "objective/entropy": -8.374664306640625, + "objective/kl": 7.630832672119141, + "objective/non_score_reward": -0.7630833387374878, + "objective/rlhf_reward": -0.12861440026876592, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.505527019500732, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7015961408615112, + "step": 970, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000675678253174 + }, + { + "episode": 15552, + "epoch": 0.2795412877017651, + "loss/policy_avg": 0.21652743220329285, + "lr": 2.8138420245398772e-06, + "objective/entropy": 166.86929321289062, + "objective/kl": 10.748126983642578, + "objective/non_score_reward": -1.0748127698898315, + "objective/rlhf_reward": -3.899251019954681, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.194313049316406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4750038683414459, + "step": 971, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973280429840088 + }, + { + "episode": 15568, + "epoch": 0.27982888161915376, + "loss/policy_avg": 0.49266356229782104, + "lr": 2.813650306748466e-06, + "objective/entropy": 118.69342041015625, + "objective/kl": 12.942707061767578, + "objective/non_score_reward": -1.2942707538604736, + "objective/rlhf_reward": -3.5152232996827233, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 94.40614318847656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7927144169807434, + "step": 972, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002317428588867 + }, + { + "episode": 15584, + "epoch": 0.2801164755365424, + "loss/policy_avg": -0.02213054895401001, + "lr": 2.8134585889570553e-06, + "objective/entropy": 187.91323852539062, + "objective/kl": 18.41681671142578, + "objective/non_score_reward": -1.841681718826294, + "objective/rlhf_reward": -9.366726875305176, + "objective/scores": -0.5, + "policy/approxkl_avg": 153.68333435058594, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7450737953186035, + "step": 973, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002140760421753 + }, + { + "episode": 15600, + "epoch": 0.28040406945393104, + "loss/policy_avg": -0.016699761152267456, + "lr": 2.813266871165644e-06, + "objective/entropy": 20.254638671875, + "objective/kl": 5.66358757019043, + "objective/non_score_reward": -0.566358745098114, + "objective/rlhf_reward": -4.265435218811035, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.735843658447266, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.626510739326477, + "step": 974, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003321886062622 + }, + { + "episode": 15616, + "epoch": 0.2806916633713197, + "loss/policy_avg": 0.5562024712562561, + "lr": 2.8130751533742333e-06, + "objective/entropy": 83.20606994628906, + "objective/kl": 12.872986793518066, + "objective/non_score_reward": -1.2872986793518066, + "objective/rlhf_reward": -7.149194717407227, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.93476486206055, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6668212413787842, + "step": 975, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978232383728027 + }, + { + "episode": 15632, + "epoch": 0.2809792572887083, + "loss/policy_avg": 0.11060173809528351, + "lr": 2.812883435582822e-06, + "objective/entropy": -130.44863891601562, + "objective/kl": 12.669355392456055, + "objective/non_score_reward": -1.2669358253479004, + "objective/rlhf_reward": -2.1440238698732585, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 30.233688354492188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5940582752227783, + "step": 976, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00014328956604 + }, + { + "episode": 15648, + "epoch": 0.281266851206097, + "loss/policy_avg": 0.1823975145816803, + "lr": 2.812691717791411e-06, + "objective/entropy": 94.65828704833984, + "objective/kl": 12.634793281555176, + "objective/non_score_reward": -1.2634793519973755, + "objective/rlhf_reward": -2.130198453308317, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 86.40443420410156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5936030149459839, + "step": 977, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006115436553955 + }, + { + "episode": 15664, + "epoch": 0.28155444512348565, + "loss/policy_avg": 0.2329886257648468, + "lr": 2.8125e-06, + "objective/entropy": 214.52359008789062, + "objective/kl": 14.259061813354492, + "objective/non_score_reward": -1.4259061813354492, + "objective/rlhf_reward": -1.3036247253417965, + "objective/scores": 1.1, + "policy/approxkl_avg": 51.973575592041016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7015668153762817, + "step": 978, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960739612579346 + }, + { + "episode": 15680, + "epoch": 0.2818420390408743, + "loss/policy_avg": 0.1662026047706604, + "lr": 2.812308282208589e-06, + "objective/entropy": 140.4540252685547, + "objective/kl": 18.81513786315918, + "objective/non_score_reward": -1.881514072418213, + "objective/rlhf_reward": -5.578645418362553, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 54.464908599853516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5715263485908508, + "step": 979, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991223812103271 + }, + { + "episode": 15696, + "epoch": 0.28212963295826293, + "loss/policy_avg": 0.5212792754173279, + "lr": 2.8121165644171782e-06, + "objective/entropy": 271.5147705078125, + "objective/kl": 11.757984161376953, + "objective/non_score_reward": -1.1757985353469849, + "objective/rlhf_reward": -2.580488087908302, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 99.44505310058594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6800845265388489, + "step": 980, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972641468048096 + }, + { + "episode": 15712, + "epoch": 0.2824172268756516, + "loss/policy_avg": 0.10810688138008118, + "lr": 2.811924846625767e-06, + "objective/entropy": 220.07534790039062, + "objective/kl": 9.518316268920898, + "objective/non_score_reward": -0.9518316984176636, + "objective/rlhf_reward": -0.8836077495825019, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.45026969909668, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6736278533935547, + "step": 981, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0032143592834473 + }, + { + "episode": 15728, + "epoch": 0.2827048207930402, + "loss/policy_avg": 0.1366291642189026, + "lr": 2.811733128834356e-06, + "objective/entropy": -55.597900390625, + "objective/kl": 9.75802230834961, + "objective/non_score_reward": -0.9758022427558899, + "objective/rlhf_reward": -5.9032087326049805, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.722705841064453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4849565923213959, + "step": 982, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987678527832031 + }, + { + "episode": 15744, + "epoch": 0.28299241471042885, + "loss/policy_avg": 0.29598093032836914, + "lr": 2.811541411042945e-06, + "objective/entropy": 65.98103332519531, + "objective/kl": 8.494199752807617, + "objective/non_score_reward": -0.8494198322296143, + "objective/rlhf_reward": 1.002320730686188, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.425743103027344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.38723868131637573, + "step": 983, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991505146026611 + }, + { + "episode": 15760, + "epoch": 0.28328000862781755, + "loss/policy_avg": 0.44190719723701477, + "lr": 2.811349693251534e-06, + "objective/entropy": -63.0709114074707, + "objective/kl": 13.403665542602539, + "objective/non_score_reward": -1.3403666019439697, + "objective/rlhf_reward": -2.437747453094694, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 51.38234329223633, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.630331814289093, + "step": 984, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976906776428223 + }, + { + "episode": 15776, + "epoch": 0.2835676025452062, + "loss/policy_avg": 0.08249235153198242, + "lr": 2.8111579754601227e-06, + "objective/entropy": -40.74168395996094, + "objective/kl": 17.953168869018555, + "objective/non_score_reward": -1.7953169345855713, + "objective/rlhf_reward": -4.7812679469585415, + "objective/scores": 0.6, + "policy/approxkl_avg": 120.02989196777344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5399852991104126, + "step": 985, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002918243408203 + }, + { + "episode": 15792, + "epoch": 0.2838551964625948, + "loss/policy_avg": 0.2005055695772171, + "lr": 2.810966257668712e-06, + "objective/entropy": -191.1624755859375, + "objective/kl": 9.120893478393555, + "objective/non_score_reward": -0.9120894074440002, + "objective/rlhf_reward": -5.648357391357422, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.947835922241211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.609139084815979, + "step": 986, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988343715667725 + }, + { + "episode": 15808, + "epoch": 0.28414279037998347, + "loss/policy_avg": 0.7648462057113647, + "lr": 2.8107745398773007e-06, + "objective/entropy": 359.6585693359375, + "objective/kl": 18.952003479003906, + "objective/non_score_reward": -1.895200490951538, + "objective/rlhf_reward": -5.458095910326515, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 54.210479736328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8480129837989807, + "step": 987, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999436616897583 + }, + { + "episode": 15824, + "epoch": 0.2844303842973721, + "loss/policy_avg": 0.6691151857376099, + "lr": 2.81058282208589e-06, + "objective/entropy": 73.57808685302734, + "objective/kl": 18.581069946289062, + "objective/non_score_reward": -1.858107089996338, + "objective/rlhf_reward": -5.309722246901069, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 32.08354949951172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4837074875831604, + "step": 988, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994486570358276 + }, + { + "episode": 15840, + "epoch": 0.28471797821476075, + "loss/policy_avg": 0.2605787217617035, + "lr": 2.8103911042944788e-06, + "objective/entropy": -190.95657348632812, + "objective/kl": 11.05725383758545, + "objective/non_score_reward": -1.1057254076004028, + "objective/rlhf_reward": -2.022901570796966, + "objective/scores": 0.6, + "policy/approxkl_avg": 29.785179138183594, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.74052894115448, + "step": 989, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0007266998291016 + }, + { + "episode": 15856, + "epoch": 0.2850055721321494, + "loss/policy_avg": 0.17699474096298218, + "lr": 2.8101993865030676e-06, + "objective/entropy": 202.82345581054688, + "objective/kl": 10.571136474609375, + "objective/non_score_reward": -1.0571134090423584, + "objective/rlhf_reward": -3.8284538000822064, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.342830657958984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5505605340003967, + "step": 990, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986047744750977 + }, + { + "episode": 15872, + "epoch": 0.285293166049538, + "loss/policy_avg": 0.2762322425842285, + "lr": 2.8100076687116564e-06, + "objective/entropy": -215.3885498046875, + "objective/kl": 11.097982406616211, + "objective/non_score_reward": -1.1097981929779053, + "objective/rlhf_reward": -4.039192607998848, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.58441162109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.554857611656189, + "step": 991, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000056743621826 + }, + { + "episode": 15888, + "epoch": 0.2855807599669267, + "loss/policy_avg": 0.8083657026290894, + "lr": 2.809815950920245e-06, + "objective/entropy": 136.85971069335938, + "objective/kl": 14.077508926391602, + "objective/non_score_reward": -1.4077508449554443, + "objective/rlhf_reward": -1.2310034692287442, + "objective/scores": 1.1, + "policy/approxkl_avg": 50.88481903076172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6447016596794128, + "step": 992, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999983310699463 + }, + { + "episode": 15904, + "epoch": 0.28586835388431536, + "loss/policy_avg": 0.09076584875583649, + "lr": 2.8096242331288344e-06, + "objective/entropy": -15.63296127319336, + "objective/kl": 17.013904571533203, + "objective/non_score_reward": -1.701390266418457, + "objective/rlhf_reward": -6.405561363697052, + "objective/scores": 0.1, + "policy/approxkl_avg": 94.94831848144531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5415375828742981, + "step": 993, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992749691009521 + }, + { + "episode": 15920, + "epoch": 0.286155947801704, + "loss/policy_avg": 0.30634552240371704, + "lr": 2.8094325153374232e-06, + "objective/entropy": 146.68658447265625, + "objective/kl": 11.60175895690918, + "objective/non_score_reward": -1.1601760387420654, + "objective/rlhf_reward": -6.640704154968262, + "objective/scores": -0.5, + "policy/approxkl_avg": 34.54106521606445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.577959418296814, + "step": 994, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996502161026001 + }, + { + "episode": 15936, + "epoch": 0.28644354171909264, + "loss/policy_avg": -0.13943040370941162, + "lr": 2.809240797546012e-06, + "objective/entropy": 84.48971557617188, + "objective/kl": 15.685787200927734, + "objective/non_score_reward": -1.5685787200927734, + "objective/rlhf_reward": -4.449486131939004, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 35.03499221801758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4448893070220947, + "step": 995, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0012125968933105 + }, + { + "episode": 15952, + "epoch": 0.2867311356364813, + "loss/policy_avg": 0.5280898213386536, + "lr": 2.8090490797546013e-06, + "objective/entropy": 270.3984375, + "objective/kl": 17.52050018310547, + "objective/non_score_reward": -1.7520501613616943, + "objective/rlhf_reward": -6.608200347423553, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.95287322998047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6844844818115234, + "step": 996, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999497413635254 + }, + { + "episode": 15968, + "epoch": 0.2870187295538699, + "loss/policy_avg": 0.7680552005767822, + "lr": 2.80885736196319e-06, + "objective/entropy": 83.93873596191406, + "objective/kl": 15.629266738891602, + "objective/non_score_reward": -1.5629265308380127, + "objective/rlhf_reward": -5.8517063617706295, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.26701354980469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6365024447441101, + "step": 997, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980711936950684 + }, + { + "episode": 15984, + "epoch": 0.28730632347125856, + "loss/policy_avg": 0.031346116214990616, + "lr": 2.8086656441717793e-06, + "objective/entropy": 55.36757278442383, + "objective/kl": 5.684802055358887, + "objective/non_score_reward": -0.5684801936149597, + "objective/rlhf_reward": -1.873920848965645, + "objective/scores": 0.1, + "policy/approxkl_avg": 0.7184413075447083, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48540106415748596, + "step": 998, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0035548210144043 + }, + { + "episode": 16000, + "epoch": 0.28759391738864726, + "loss/policy_avg": 0.12601952254772186, + "lr": 2.808473926380368e-06, + "objective/entropy": -26.834529876708984, + "objective/kl": 9.361145973205566, + "objective/non_score_reward": -0.9361146688461304, + "objective/rlhf_reward": -0.8207396909010141, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 71.52543640136719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6758915185928345, + "step": 999, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973740577697754 + }, + { + "episode": 16016, + "epoch": 0.2878815113060359, + "loss/policy_avg": 0.9146034121513367, + "lr": 2.808282208588957e-06, + "objective/entropy": 227.4399871826172, + "objective/kl": 11.205362319946289, + "objective/non_score_reward": -1.120536208152771, + "objective/rlhf_reward": -4.082144802808761, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.882693290710449, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5677628517150879, + "step": 1000, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989690780639648 + }, + { + "episode": 16032, + "epoch": 0.28816910522342454, + "loss/policy_avg": 0.17238189280033112, + "lr": 2.808090490797546e-06, + "objective/entropy": 33.76301193237305, + "objective/kl": 22.955963134765625, + "objective/non_score_reward": -2.295596122741699, + "objective/rlhf_reward": -8.782385206222534, + "objective/scores": 0.1, + "policy/approxkl_avg": 231.3272705078125, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6772900223731995, + "step": 1001, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999369382858276 + }, + { + "episode": 16048, + "epoch": 0.2884566991408132, + "loss/policy_avg": 0.2179027795791626, + "lr": 2.807898773006135e-06, + "objective/entropy": 165.42457580566406, + "objective/kl": 12.760942459106445, + "objective/non_score_reward": -1.2760944366455078, + "objective/rlhf_reward": -0.7043775081634518, + "objective/scores": 1.1, + "policy/approxkl_avg": 23.380008697509766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6448920965194702, + "step": 1002, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009689331054688 + }, + { + "episode": 16064, + "epoch": 0.2887442930582018, + "loss/policy_avg": 0.21163496375083923, + "lr": 2.8077070552147242e-06, + "objective/entropy": 219.0919647216797, + "objective/kl": 17.551136016845703, + "objective/non_score_reward": -1.7551138401031494, + "objective/rlhf_reward": -6.620455002784729, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.19704055786133, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5705841779708862, + "step": 1003, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992287158966064 + }, + { + "episode": 16080, + "epoch": 0.28903188697559046, + "loss/policy_avg": 0.2837230861186981, + "lr": 2.807515337423313e-06, + "objective/entropy": -219.6112060546875, + "objective/kl": 18.70673942565918, + "objective/non_score_reward": -1.8706741333007812, + "objective/rlhf_reward": -7.082696056365966, + "objective/scores": 0.1, + "policy/approxkl_avg": 109.35348510742188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.60931396484375, + "step": 1004, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979100227355957 + }, + { + "episode": 16096, + "epoch": 0.2893194808929791, + "loss/policy_avg": -0.19084030389785767, + "lr": 2.807323619631902e-06, + "objective/entropy": -65.57774353027344, + "objective/kl": 7.905247211456299, + "objective/non_score_reward": -0.7905246019363403, + "objective/rlhf_reward": -5.162098407745361, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.45587921142578, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5015524625778198, + "step": 1005, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000932216644287 + }, + { + "episode": 16112, + "epoch": 0.28960707481036774, + "loss/policy_avg": 0.08700132369995117, + "lr": 2.807131901840491e-06, + "objective/entropy": 273.9281311035156, + "objective/kl": 13.745208740234375, + "objective/non_score_reward": -1.3745208978652954, + "objective/rlhf_reward": -3.375377314464126, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 66.57199096679688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6697818040847778, + "step": 1006, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990549087524414 + }, + { + "episode": 16128, + "epoch": 0.28989466872775643, + "loss/policy_avg": 0.39911019802093506, + "lr": 2.80694018404908e-06, + "objective/entropy": 123.5094223022461, + "objective/kl": 19.111557006835938, + "objective/non_score_reward": -1.9111559391021729, + "objective/rlhf_reward": -5.8197947695580226, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 223.53636169433594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5573745965957642, + "step": 1007, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973094463348389 + }, + { + "episode": 16144, + "epoch": 0.2901822626451451, + "loss/policy_avg": 0.4091383218765259, + "lr": 2.8067484662576687e-06, + "objective/entropy": -59.11369323730469, + "objective/kl": 10.690498352050781, + "objective/non_score_reward": -1.0690498352050781, + "objective/rlhf_reward": -3.8761992067098614, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.191486358642578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5697863101959229, + "step": 1008, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996948480606079 + }, + { + "episode": 16160, + "epoch": 0.2904698565625337, + "loss/policy_avg": 0.144743874669075, + "lr": 2.806556748466258e-06, + "objective/entropy": 78.24107360839844, + "objective/kl": 16.85831642150879, + "objective/non_score_reward": -1.6858315467834473, + "objective/rlhf_reward": -6.343326365947723, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.000248908996582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8502017855644226, + "step": 1009, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001063346862793 + }, + { + "episode": 16176, + "epoch": 0.29075745047992235, + "loss/policy_avg": 0.14420843124389648, + "lr": 2.8063650306748467e-06, + "objective/entropy": 98.99222564697266, + "objective/kl": 15.894691467285156, + "objective/non_score_reward": -1.589469075202942, + "objective/rlhf_reward": -5.957876390218734, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.941476821899414, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7167706489562988, + "step": 1010, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974000453948975 + }, + { + "episode": 16192, + "epoch": 0.291045044397311, + "loss/policy_avg": 0.11632807552814484, + "lr": 2.806173312883436e-06, + "objective/entropy": 55.578338623046875, + "objective/kl": 10.901174545288086, + "objective/non_score_reward": -1.0901174545288086, + "objective/rlhf_reward": -6.360469818115234, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.527301788330078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7306192517280579, + "step": 1011, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974496364593506 + }, + { + "episode": 16208, + "epoch": 0.29133263831469963, + "loss/policy_avg": 0.48613467812538147, + "lr": 2.8059815950920244e-06, + "objective/entropy": 96.17320251464844, + "objective/kl": 11.01476001739502, + "objective/non_score_reward": -1.1014759540557861, + "objective/rlhf_reward": -4.0059039950370785, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.446365356445312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6368293762207031, + "step": 1012, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981093406677246 + }, + { + "episode": 16224, + "epoch": 0.2916202322320883, + "loss/policy_avg": 0.8086050748825073, + "lr": 2.8057898773006136e-06, + "objective/entropy": 50.43046188354492, + "objective/kl": 15.851388931274414, + "objective/non_score_reward": -1.5851387977600098, + "objective/rlhf_reward": -1.9405554294586178, + "objective/scores": 1.1, + "policy/approxkl_avg": 146.61557006835938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6533379554748535, + "step": 1013, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996522068977356 + }, + { + "episode": 16240, + "epoch": 0.2919078261494769, + "loss/policy_avg": 0.5354580879211426, + "lr": 2.8055981595092024e-06, + "objective/entropy": -83.87467193603516, + "objective/kl": 12.357803344726562, + "objective/non_score_reward": -1.2357802391052246, + "objective/rlhf_reward": -2.5431207478046414, + "objective/scores": 0.6, + "policy/approxkl_avg": 74.94430541992188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6065086722373962, + "step": 1014, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9968059062957764 + }, + { + "episode": 16256, + "epoch": 0.2921954200668656, + "loss/policy_avg": 0.10914095491170883, + "lr": 2.805406441717791e-06, + "objective/entropy": 68.0996322631836, + "objective/kl": 20.283634185791016, + "objective/non_score_reward": -2.0283632278442383, + "objective/rlhf_reward": -7.713453269004821, + "objective/scores": 0.1, + "policy/approxkl_avg": 97.01466369628906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47718697786331177, + "step": 1015, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000540256500244 + }, + { + "episode": 16272, + "epoch": 0.29248301398425425, + "loss/policy_avg": 0.41233736276626587, + "lr": 2.8052147239263804e-06, + "objective/entropy": 203.62808227539062, + "objective/kl": 19.823139190673828, + "objective/non_score_reward": -1.982313632965088, + "objective/rlhf_reward": -9.929254531860352, + "objective/scores": -0.5, + "policy/approxkl_avg": 127.52713775634766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8571972250938416, + "step": 1016, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971632957458496 + }, + { + "episode": 16288, + "epoch": 0.2927706079016429, + "loss/policy_avg": 0.33414748311042786, + "lr": 2.8050230061349693e-06, + "objective/entropy": -66.33137512207031, + "objective/kl": 16.306615829467773, + "objective/non_score_reward": -1.6306617259979248, + "objective/rlhf_reward": -2.122646844387054, + "objective/scores": 1.1, + "policy/approxkl_avg": 23.996356964111328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6071459054946899, + "step": 1017, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0003702640533447 + }, + { + "episode": 16304, + "epoch": 0.2930582018190315, + "loss/policy_avg": 0.2442750632762909, + "lr": 2.8048312883435585e-06, + "objective/entropy": 91.87252807617188, + "objective/kl": 14.24521541595459, + "objective/non_score_reward": -1.4245214462280273, + "objective/rlhf_reward": -3.575379910246406, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 26.111114501953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9083739519119263, + "step": 1018, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987156391143799 + }, + { + "episode": 16320, + "epoch": 0.29334579573642017, + "loss/policy_avg": 0.755403995513916, + "lr": 2.8046395705521473e-06, + "objective/entropy": -104.68330383300781, + "objective/kl": 8.859827041625977, + "objective/non_score_reward": -0.8859825134277344, + "objective/rlhf_reward": -5.5439300537109375, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.549600601196289, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7196685671806335, + "step": 1019, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999251127243042 + }, + { + "episode": 16336, + "epoch": 0.2936333896538088, + "loss/policy_avg": 0.1244896948337555, + "lr": 2.804447852760736e-06, + "objective/entropy": 156.35601806640625, + "objective/kl": 9.711006164550781, + "objective/non_score_reward": -0.9711006879806519, + "objective/rlhf_reward": -1.4844029307365418, + "objective/scores": 0.6, + "policy/approxkl_avg": 23.540321350097656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5550477504730225, + "step": 1020, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976718425750732 + }, + { + "episode": 16352, + "epoch": 0.29392098357119745, + "loss/policy_avg": 0.37241601943969727, + "lr": 2.8042561349693253e-06, + "objective/entropy": 68.94697570800781, + "objective/kl": 13.95817756652832, + "objective/non_score_reward": -1.395817756652832, + "objective/rlhf_reward": -7.583271026611328, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.297168731689453, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6176844835281372, + "step": 1021, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974250793457031 + }, + { + "episode": 16368, + "epoch": 0.29420857748858614, + "loss/policy_avg": 0.48266342282295227, + "lr": 2.804064417177914e-06, + "objective/entropy": 129.90074157714844, + "objective/kl": 19.586383819580078, + "objective/non_score_reward": -1.9586385488510132, + "objective/rlhf_reward": -4.9108348234903545, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 28.53821563720703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5636917352676392, + "step": 1022, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997184157371521 + }, + { + "episode": 16384, + "epoch": 0.2944961714059748, + "loss/policy_avg": 0.39663034677505493, + "lr": 2.803872699386503e-06, + "objective/entropy": -57.39234924316406, + "objective/kl": 14.80410385131836, + "objective/non_score_reward": -1.4804103374481201, + "objective/rlhf_reward": -3.521641409397125, + "objective/scores": 0.6, + "policy/approxkl_avg": 97.82295227050781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7238129377365112, + "step": 1023, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998586893081665 + }, + { + "episode": 16400, + "epoch": 0.2947837653233634, + "loss/policy_avg": 0.39789485931396484, + "lr": 2.803680981595092e-06, + "objective/entropy": 19.74584197998047, + "objective/kl": 13.682332992553711, + "objective/non_score_reward": -1.368233323097229, + "objective/rlhf_reward": -5.072933173179626, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.714088439941406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7020970582962036, + "step": 1024, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982075691223145 + }, + { + "episode": 16416, + "epoch": 0.29507135924075206, + "loss/policy_avg": -0.1076928898692131, + "lr": 2.803489263803681e-06, + "objective/entropy": -72.29051208496094, + "objective/kl": 19.678255081176758, + "objective/non_score_reward": -1.9678255319595337, + "objective/rlhf_reward": -3.4713020682334896, + "objective/scores": 1.1, + "policy/approxkl_avg": 158.48939514160156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6535936594009399, + "step": 1025, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994299411773682 + }, + { + "episode": 16432, + "epoch": 0.2953589531581407, + "loss/policy_avg": 0.2257157564163208, + "lr": 2.8032975460122702e-06, + "objective/entropy": 12.529336929321289, + "objective/kl": 9.745655059814453, + "objective/non_score_reward": -0.9745655059814453, + "objective/rlhf_reward": 0.5017380058765415, + "objective/scores": 1.1, + "policy/approxkl_avg": 35.45285415649414, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6599619388580322, + "step": 1026, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0049285888671875 + }, + { + "episode": 16448, + "epoch": 0.29564654707552934, + "loss/policy_avg": 0.3417929410934448, + "lr": 2.803105828220859e-06, + "objective/entropy": 29.500675201416016, + "objective/kl": 17.312862396240234, + "objective/non_score_reward": -1.7312862873077393, + "objective/rlhf_reward": -2.525145328044891, + "objective/scores": 1.1, + "policy/approxkl_avg": 118.51178741455078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4447431266307831, + "step": 1027, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997718095779419 + }, + { + "episode": 16464, + "epoch": 0.295934140992918, + "loss/policy_avg": 0.7444425821304321, + "lr": 2.802914110429448e-06, + "objective/entropy": -81.61296844482422, + "objective/kl": 15.141607284545898, + "objective/non_score_reward": -1.5141608715057373, + "objective/rlhf_reward": -3.6566432178020474, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.98291015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7052822113037109, + "step": 1028, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991610050201416 + }, + { + "episode": 16480, + "epoch": 0.2962217349103066, + "loss/policy_avg": 0.7192173004150391, + "lr": 2.802722392638037e-06, + "objective/entropy": 68.40217590332031, + "objective/kl": 19.928909301757812, + "objective/non_score_reward": -1.9928908348083496, + "objective/rlhf_reward": -3.571563637256622, + "objective/scores": 1.1, + "policy/approxkl_avg": 217.04953002929688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5880913734436035, + "step": 1029, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001051664352417 + }, + { + "episode": 16496, + "epoch": 0.2965093288276953, + "loss/policy_avg": 0.36680111289024353, + "lr": 2.802530674846626e-06, + "objective/entropy": 77.72987365722656, + "objective/kl": 8.90180492401123, + "objective/non_score_reward": -0.8901805877685547, + "objective/rlhf_reward": 0.8392777085304264, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.994407653808594, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5440889596939087, + "step": 1030, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000692129135132 + }, + { + "episode": 16512, + "epoch": 0.29679692274508396, + "loss/policy_avg": -0.17457365989685059, + "lr": 2.802338957055215e-06, + "objective/entropy": 81.9105224609375, + "objective/kl": 9.55565071105957, + "objective/non_score_reward": -0.9555650949478149, + "objective/rlhf_reward": -3.422260200977325, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.63282585144043, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6269766092300415, + "step": 1031, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.007657527923584 + }, + { + "episode": 16528, + "epoch": 0.2970845166624726, + "loss/policy_avg": 0.24074256420135498, + "lr": 2.802147239263804e-06, + "objective/entropy": 210.78358459472656, + "objective/kl": 16.9814510345459, + "objective/non_score_reward": -1.6981453895568848, + "objective/rlhf_reward": -6.392581349611282, + "objective/scores": 0.1, + "policy/approxkl_avg": 152.18362426757812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5589526295661926, + "step": 1032, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999018669128418 + }, + { + "episode": 16544, + "epoch": 0.29737211057986124, + "loss/policy_avg": 0.18762660026550293, + "lr": 2.8019555214723927e-06, + "objective/entropy": 71.11669921875, + "objective/kl": 13.297914505004883, + "objective/non_score_reward": -1.32979154586792, + "objective/rlhf_reward": -7.31916618347168, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.81365966796875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.690626859664917, + "step": 1033, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998163104057312 + }, + { + "episode": 16560, + "epoch": 0.2976597044972499, + "loss/policy_avg": 0.18001145124435425, + "lr": 2.8017638036809816e-06, + "objective/entropy": 168.59170532226562, + "objective/kl": 16.74738121032715, + "objective/non_score_reward": -1.6747379302978516, + "objective/rlhf_reward": -2.29895190000534, + "objective/scores": 1.1, + "policy/approxkl_avg": 25.452075958251953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5927215218544006, + "step": 1034, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000150203704834 + }, + { + "episode": 16576, + "epoch": 0.2979472984146385, + "loss/policy_avg": 0.14382623136043549, + "lr": 2.8015720858895704e-06, + "objective/entropy": 6.768333435058594, + "objective/kl": 10.177921295166016, + "objective/non_score_reward": -1.0177922248840332, + "objective/rlhf_reward": -1.1474501236688819, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.347600936889648, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7635913491249084, + "step": 1035, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986321926116943 + }, + { + "episode": 16592, + "epoch": 0.29823489233202716, + "loss/policy_avg": -0.041412293910980225, + "lr": 2.8013803680981596e-06, + "objective/entropy": 208.7352752685547, + "objective/kl": 12.503351211547852, + "objective/non_score_reward": -1.2503352165222168, + "objective/rlhf_reward": -7.001340866088867, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.922041893005371, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5301596522331238, + "step": 1036, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0028204917907715 + }, + { + "episode": 16608, + "epoch": 0.2985224862494158, + "loss/policy_avg": 0.2591552138328552, + "lr": 2.8011886503067484e-06, + "objective/entropy": 192.43026733398438, + "objective/kl": 16.139490127563477, + "objective/non_score_reward": -1.6139490604400635, + "objective/rlhf_reward": -4.793936526001083, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 114.85481262207031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7047553062438965, + "step": 1037, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972035884857178 + }, + { + "episode": 16624, + "epoch": 0.2988100801668045, + "loss/policy_avg": 0.7857614755630493, + "lr": 2.8009969325153372e-06, + "objective/entropy": -56.200775146484375, + "objective/kl": 15.523076057434082, + "objective/non_score_reward": -1.5523076057434082, + "objective/rlhf_reward": -1.80923039317131, + "objective/scores": 1.1, + "policy/approxkl_avg": 18.877765655517578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6590103507041931, + "step": 1038, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990099668502808 + }, + { + "episode": 16640, + "epoch": 0.29909767408419313, + "loss/policy_avg": 0.22429285943508148, + "lr": 2.8008052147239264e-06, + "objective/entropy": -137.72122192382812, + "objective/kl": 10.073450088500977, + "objective/non_score_reward": -1.0073450803756714, + "objective/rlhf_reward": -3.629380321502685, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.942007064819336, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5248731374740601, + "step": 1039, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995999336242676 + }, + { + "episode": 16656, + "epoch": 0.2993852680015818, + "loss/policy_avg": 0.329068124294281, + "lr": 2.8006134969325153e-06, + "objective/entropy": -55.219940185546875, + "objective/kl": 17.680944442749023, + "objective/non_score_reward": -1.7680946588516235, + "objective/rlhf_reward": -2.672378635406494, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.21047973632812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5964963436126709, + "step": 1040, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971622228622437 + }, + { + "episode": 16672, + "epoch": 0.2996728619189704, + "loss/policy_avg": 0.15215516090393066, + "lr": 2.8004217791411045e-06, + "objective/entropy": 75.79779815673828, + "objective/kl": 19.695858001708984, + "objective/non_score_reward": -1.9695857763290405, + "objective/rlhf_reward": -3.4783432245254513, + "objective/scores": 1.1, + "policy/approxkl_avg": 33.504329681396484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6170982122421265, + "step": 1041, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998176097869873 + }, + { + "episode": 16688, + "epoch": 0.29996045583635905, + "loss/policy_avg": 0.41010716557502747, + "lr": 2.8002300613496933e-06, + "objective/entropy": 16.160049438476562, + "objective/kl": 10.406661033630371, + "objective/non_score_reward": -1.040666103363037, + "objective/rlhf_reward": -2.5008046976929768, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 4.847574234008789, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6609134078025818, + "step": 1042, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0026755332946777 + }, + { + "episode": 16704, + "epoch": 0.3002480497537477, + "loss/policy_avg": 0.26580554246902466, + "lr": 2.800038343558282e-06, + "objective/entropy": 27.727663040161133, + "objective/kl": 11.889737129211426, + "objective/non_score_reward": -1.1889736652374268, + "objective/rlhf_reward": -4.35589433312416, + "objective/scores": 0.1, + "policy/approxkl_avg": 135.2098388671875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7928782105445862, + "step": 1043, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992873668670654 + }, + { + "episode": 16720, + "epoch": 0.30053564367113633, + "loss/policy_avg": 0.2747136950492859, + "lr": 2.7998466257668713e-06, + "objective/entropy": 174.1541290283203, + "objective/kl": 14.657130241394043, + "objective/non_score_reward": -1.4657130241394043, + "objective/rlhf_reward": -5.462852334976196, + "objective/scores": 0.1, + "policy/approxkl_avg": 56.979190826416016, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.9884006977081299, + "step": 1044, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0000054836273193 + }, + { + "episode": 16736, + "epoch": 0.300823237588525, + "loss/policy_avg": 0.5620021820068359, + "lr": 2.79965490797546e-06, + "objective/entropy": -178.74172973632812, + "objective/kl": 18.4420108795166, + "objective/non_score_reward": -1.8442012071609497, + "objective/rlhf_reward": -5.254098551646743, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 104.42015075683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6059384346008301, + "step": 1045, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990811347961426 + }, + { + "episode": 16752, + "epoch": 0.30111083150591367, + "loss/policy_avg": 0.24908019602298737, + "lr": 2.799463190184049e-06, + "objective/entropy": 147.04888916015625, + "objective/kl": 13.451667785644531, + "objective/non_score_reward": -1.3451666831970215, + "objective/rlhf_reward": -3.433255563454564, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 20.122337341308594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6942051649093628, + "step": 1046, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999792575836182 + }, + { + "episode": 16768, + "epoch": 0.3013984254233023, + "loss/policy_avg": 0.45103228092193604, + "lr": 2.799271472392638e-06, + "objective/entropy": -139.34446716308594, + "objective/kl": 10.701043128967285, + "objective/non_score_reward": -1.0701043605804443, + "objective/rlhf_reward": 0.11958267688751256, + "objective/scores": 1.1, + "policy/approxkl_avg": 8.23202133178711, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6052207946777344, + "step": 1047, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9994077682495117 + }, + { + "episode": 16784, + "epoch": 0.30168601934069095, + "loss/policy_avg": 0.12558560073375702, + "lr": 2.799079754601227e-06, + "objective/entropy": 68.8547134399414, + "objective/kl": 11.65601634979248, + "objective/non_score_reward": -1.1656014919281006, + "objective/rlhf_reward": -2.2624060869216915, + "objective/scores": 0.6, + "policy/approxkl_avg": 29.906028747558594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5991178750991821, + "step": 1048, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000406265258789 + }, + { + "episode": 16800, + "epoch": 0.3019736132580796, + "loss/policy_avg": -0.44990408420562744, + "lr": 2.7988880368098162e-06, + "objective/entropy": 197.53802490234375, + "objective/kl": 11.517388343811035, + "objective/non_score_reward": -1.1517388820648193, + "objective/rlhf_reward": -1.6832366927873819, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 103.94009399414062, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7234064340591431, + "step": 1049, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0036420822143555 + }, + { + "episode": 16816, + "epoch": 0.3022612071754682, + "loss/policy_avg": -0.47146177291870117, + "lr": 2.798696319018405e-06, + "objective/entropy": 12.529426574707031, + "objective/kl": 8.463263511657715, + "objective/non_score_reward": -0.846326470375061, + "objective/rlhf_reward": -1.651972428957621, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 23.17457389831543, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6993383169174194, + "step": 1050, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000576972961426 + }, + { + "episode": 16832, + "epoch": 0.30254880109285687, + "loss/policy_avg": 0.27615439891815186, + "lr": 2.798504601226994e-06, + "objective/entropy": 75.7374496459961, + "objective/kl": 13.605602264404297, + "objective/non_score_reward": -1.3605601787567139, + "objective/rlhf_reward": -7.442241191864014, + "objective/scores": -0.5, + "policy/approxkl_avg": 184.4613800048828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6710618138313293, + "step": 1051, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975073337554932 + }, + { + "episode": 16848, + "epoch": 0.3028363950102455, + "loss/policy_avg": 0.2937107980251312, + "lr": 2.798312883435583e-06, + "objective/entropy": 61.35673141479492, + "objective/kl": 18.245712280273438, + "objective/non_score_reward": -1.8245713710784912, + "objective/rlhf_reward": -4.374566708446714, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 35.17201614379883, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6811769008636475, + "step": 1052, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998936653137207 + }, + { + "episode": 16864, + "epoch": 0.3031239889276342, + "loss/policy_avg": 0.3230166733264923, + "lr": 2.798121165644172e-06, + "objective/entropy": -3.2104339599609375, + "objective/kl": 12.775344848632812, + "objective/non_score_reward": -1.2775344848632812, + "objective/rlhf_reward": -2.987431766764198, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 53.64230728149414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6872749328613281, + "step": 1053, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995277762413025 + }, + { + "episode": 16880, + "epoch": 0.30341158284502284, + "loss/policy_avg": 0.0927608460187912, + "lr": 2.797929447852761e-06, + "objective/entropy": 85.36604309082031, + "objective/kl": 15.402597427368164, + "objective/non_score_reward": -1.5402597188949585, + "objective/rlhf_reward": -5.761038935184478, + "objective/scores": 0.1, + "policy/approxkl_avg": 74.3502426147461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.605860710144043, + "step": 1054, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997683763504028 + }, + { + "episode": 16896, + "epoch": 0.3036991767624115, + "loss/policy_avg": 0.44006651639938354, + "lr": 2.79773773006135e-06, + "objective/entropy": -40.1391487121582, + "objective/kl": 17.43438720703125, + "objective/non_score_reward": -1.743438720703125, + "objective/rlhf_reward": -6.573754703998565, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.484825134277344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5174025297164917, + "step": 1055, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002403259277344 + }, + { + "episode": 16912, + "epoch": 0.3039867706798001, + "loss/policy_avg": 0.014260075986385345, + "lr": 2.7975460122699388e-06, + "objective/entropy": -85.97489929199219, + "objective/kl": 8.647266387939453, + "objective/non_score_reward": -0.8647266626358032, + "objective/rlhf_reward": -5.458906173706055, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.572721481323242, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4460605978965759, + "step": 1056, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998361349105835 + }, + { + "episode": 16928, + "epoch": 0.30427436459718876, + "loss/policy_avg": 0.047507576644420624, + "lr": 2.7973542944785276e-06, + "objective/entropy": 52.07126235961914, + "objective/kl": 15.33387565612793, + "objective/non_score_reward": -1.5333876609802246, + "objective/rlhf_reward": -5.73355042040348, + "objective/scores": 0.1, + "policy/approxkl_avg": 77.14869689941406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.703601598739624, + "step": 1057, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998363256454468 + }, + { + "episode": 16944, + "epoch": 0.3045619585145774, + "loss/policy_avg": 0.8822598457336426, + "lr": 2.7971625766871164e-06, + "objective/entropy": 198.6245880126953, + "objective/kl": 13.298235893249512, + "objective/non_score_reward": -1.329823613166809, + "objective/rlhf_reward": -7.319294452667236, + "objective/scores": -0.5, + "policy/approxkl_avg": 109.0306396484375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6059063673019409, + "step": 1058, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981026649475098 + }, + { + "episode": 16960, + "epoch": 0.30484955243196604, + "loss/policy_avg": 0.1904626488685608, + "lr": 2.7969708588957056e-06, + "objective/entropy": 218.47293090820312, + "objective/kl": 12.414791107177734, + "objective/non_score_reward": -1.2414791584014893, + "objective/rlhf_reward": -4.565916454792022, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.559885501861572, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5881438255310059, + "step": 1059, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983572959899902 + }, + { + "episode": 16976, + "epoch": 0.30513714634935474, + "loss/policy_avg": 0.29011303186416626, + "lr": 2.7967791411042944e-06, + "objective/entropy": 30.57851791381836, + "objective/kl": 10.664090156555176, + "objective/non_score_reward": -1.0664091110229492, + "objective/rlhf_reward": -6.265636444091797, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.796512603759766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7438642978668213, + "step": 1060, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986371994018555 + }, + { + "episode": 16992, + "epoch": 0.3054247402667434, + "loss/policy_avg": 0.49531346559524536, + "lr": 2.7965874233128832e-06, + "objective/entropy": 66.80797576904297, + "objective/kl": 18.846420288085938, + "objective/non_score_reward": -1.8846420049667358, + "objective/rlhf_reward": -9.538568496704102, + "objective/scores": -0.5, + "policy/approxkl_avg": 144.69943237304688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48985937237739563, + "step": 1061, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985793828964233 + }, + { + "episode": 17008, + "epoch": 0.305712334184132, + "loss/policy_avg": 0.423782616853714, + "lr": 2.7963957055214725e-06, + "objective/entropy": 36.09656524658203, + "objective/kl": 16.3355655670166, + "objective/non_score_reward": -1.633556604385376, + "objective/rlhf_reward": -8.534226417541504, + "objective/scores": -0.5, + "policy/approxkl_avg": 97.47111511230469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8207142353057861, + "step": 1062, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972548484802246 + }, + { + "episode": 17024, + "epoch": 0.30599992810152066, + "loss/policy_avg": 0.569042980670929, + "lr": 2.7962039877300613e-06, + "objective/entropy": -27.293312072753906, + "objective/kl": 17.702495574951172, + "objective/non_score_reward": -1.770249605178833, + "objective/rlhf_reward": -5.256169851097177, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 163.7218475341797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.38791435956954956, + "step": 1063, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989525079727173 + }, + { + "episode": 17040, + "epoch": 0.3062875220189093, + "loss/policy_avg": 0.45904070138931274, + "lr": 2.7960122699386505e-06, + "objective/entropy": 110.01829528808594, + "objective/kl": 17.024635314941406, + "objective/non_score_reward": -1.7024635076522827, + "objective/rlhf_reward": -6.40985426902771, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.4630126953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5167896747589111, + "step": 1064, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971375465393066 + }, + { + "episode": 17056, + "epoch": 0.30657511593629794, + "loss/policy_avg": 0.446601927280426, + "lr": 2.7958205521472393e-06, + "objective/entropy": 173.10923767089844, + "objective/kl": 11.851961135864258, + "objective/non_score_reward": -1.1851961612701416, + "objective/rlhf_reward": -4.340784585475921, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.621267318725586, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4809800982475281, + "step": 1065, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0021729469299316 + }, + { + "episode": 17072, + "epoch": 0.3068627098536866, + "loss/policy_avg": -0.12707829475402832, + "lr": 2.795628834355828e-06, + "objective/entropy": 107.12103271484375, + "objective/kl": 15.33704662322998, + "objective/non_score_reward": -1.5337047576904297, + "objective/rlhf_reward": -8.134819030761719, + "objective/scores": -0.5, + "policy/approxkl_avg": 81.38722229003906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6840176582336426, + "step": 1066, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004630088806152 + }, + { + "episode": 17088, + "epoch": 0.3071503037710752, + "loss/policy_avg": 0.6813977360725403, + "lr": 2.7954371165644174e-06, + "objective/entropy": 69.50862121582031, + "objective/kl": 11.214311599731445, + "objective/non_score_reward": -1.1214311122894287, + "objective/rlhf_reward": -4.085724315047264, + "objective/scores": 0.1, + "policy/approxkl_avg": 110.50018310546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43184494972229004, + "step": 1067, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000887155532837 + }, + { + "episode": 17104, + "epoch": 0.3074378976884639, + "loss/policy_avg": -0.03066210076212883, + "lr": 2.795245398773006e-06, + "objective/entropy": 105.03985595703125, + "objective/kl": 13.49374008178711, + "objective/non_score_reward": -1.3493739366531372, + "objective/rlhf_reward": -7.397495746612549, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.43345642089844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4681611657142639, + "step": 1068, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0003468990325928 + }, + { + "episode": 17120, + "epoch": 0.30772549160585255, + "loss/policy_avg": 0.0021042972803115845, + "lr": 2.7950536809815954e-06, + "objective/entropy": 9.283809661865234, + "objective/kl": 17.915706634521484, + "objective/non_score_reward": -1.791570782661438, + "objective/rlhf_reward": -6.766283369064331, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.99706268310547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7062077522277832, + "step": 1069, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001181125640869 + }, + { + "episode": 17136, + "epoch": 0.3080130855232412, + "loss/policy_avg": 0.6168322563171387, + "lr": 2.794861963190184e-06, + "objective/entropy": 141.08819580078125, + "objective/kl": 11.856430053710938, + "objective/non_score_reward": -1.185642957687378, + "objective/rlhf_reward": -4.34257218837738, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.94817352294922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4839014708995819, + "step": 1070, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996006488800049 + }, + { + "episode": 17152, + "epoch": 0.30830067944062983, + "loss/policy_avg": 0.4380769431591034, + "lr": 2.794670245398773e-06, + "objective/entropy": 152.61181640625, + "objective/kl": 7.851347923278809, + "objective/non_score_reward": -0.7851347923278809, + "objective/rlhf_reward": -5.140539169311523, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.251985549926758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6197154521942139, + "step": 1071, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996599793434143 + }, + { + "episode": 17168, + "epoch": 0.3085882733580185, + "loss/policy_avg": 0.3455006182193756, + "lr": 2.7944785276073622e-06, + "objective/entropy": -161.29962158203125, + "objective/kl": 14.089064598083496, + "objective/non_score_reward": -1.4089064598083496, + "objective/rlhf_reward": -7.635625839233398, + "objective/scores": -0.5, + "policy/approxkl_avg": 83.3675537109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6021291613578796, + "step": 1072, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0006675720214844 + }, + { + "episode": 17184, + "epoch": 0.3088758672754071, + "loss/policy_avg": -0.06648188829421997, + "lr": 2.794286809815951e-06, + "objective/entropy": 6.721179962158203, + "objective/kl": 12.68543815612793, + "objective/non_score_reward": -1.2685437202453613, + "objective/rlhf_reward": -7.074174880981445, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.025358200073242, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.9224163293838501, + "step": 1073, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99924635887146 + }, + { + "episode": 17200, + "epoch": 0.30916346119279575, + "loss/policy_avg": 0.10548116266727448, + "lr": 2.79409509202454e-06, + "objective/entropy": -63.38032531738281, + "objective/kl": 14.34760570526123, + "objective/non_score_reward": -1.434760570526123, + "objective/rlhf_reward": -3.7916312915849044, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 5.972915172576904, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5976482033729553, + "step": 1074, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9976732730865479 + }, + { + "episode": 17216, + "epoch": 0.3094510551101844, + "loss/policy_avg": 0.31662654876708984, + "lr": 2.793903374233129e-06, + "objective/entropy": 161.00772094726562, + "objective/kl": 8.747349739074707, + "objective/non_score_reward": -0.8747349381446838, + "objective/rlhf_reward": -3.0989399313926693, + "objective/scores": 0.1, + "policy/approxkl_avg": 59.20728302001953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4308844804763794, + "step": 1075, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99920654296875 + }, + { + "episode": 17232, + "epoch": 0.3097386490275731, + "loss/policy_avg": 0.5542811751365662, + "lr": 2.793711656441718e-06, + "objective/entropy": 210.3388671875, + "objective/kl": 10.123682975769043, + "objective/non_score_reward": -1.0123682022094727, + "objective/rlhf_reward": -3.649472868442535, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.41395378112793, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8292099237442017, + "step": 1076, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961988925933838 + }, + { + "episode": 17248, + "epoch": 0.3100262429449617, + "loss/policy_avg": 0.8028037548065186, + "lr": 2.793519938650307e-06, + "objective/entropy": 0.7303447723388672, + "objective/kl": 17.22289276123047, + "objective/non_score_reward": -1.7222893238067627, + "objective/rlhf_reward": -8.88915729522705, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.21015167236328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5208621621131897, + "step": 1077, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992649555206299 + }, + { + "episode": 17264, + "epoch": 0.31031383686235037, + "loss/policy_avg": 0.3188959062099457, + "lr": 2.793328220858896e-06, + "objective/entropy": -157.26812744140625, + "objective/kl": 9.934246063232422, + "objective/non_score_reward": -0.993424654006958, + "objective/rlhf_reward": -5.973698616027832, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.13273620605469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.831667423248291, + "step": 1078, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.998191475868225 + }, + { + "episode": 17280, + "epoch": 0.310601430779739, + "loss/policy_avg": 0.7122023105621338, + "lr": 2.7931365030674848e-06, + "objective/entropy": -39.97633361816406, + "objective/kl": 20.920761108398438, + "objective/non_score_reward": -2.092076301574707, + "objective/rlhf_reward": -10.368305206298828, + "objective/scores": -0.5, + "policy/approxkl_avg": 220.3436279296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6899431943893433, + "step": 1079, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980072975158691 + }, + { + "episode": 17296, + "epoch": 0.31088902469712765, + "loss/policy_avg": 0.22102391719818115, + "lr": 2.7929447852760736e-06, + "objective/entropy": 64.91043090820312, + "objective/kl": 12.331470489501953, + "objective/non_score_reward": -1.233147144317627, + "objective/rlhf_reward": -4.532588696479797, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.989774703979492, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6464887857437134, + "step": 1080, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013198852539062 + }, + { + "episode": 17312, + "epoch": 0.3111766186145163, + "loss/policy_avg": 0.10684894025325775, + "lr": 2.7927530674846624e-06, + "objective/entropy": 99.40476989746094, + "objective/kl": 13.806692123413086, + "objective/non_score_reward": -1.380669116973877, + "objective/rlhf_reward": -3.7893431643644964, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 28.995933532714844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5542625188827515, + "step": 1081, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000174045562744 + }, + { + "episode": 17328, + "epoch": 0.3114642125319049, + "loss/policy_avg": 0.30782192945480347, + "lr": 2.7925613496932516e-06, + "objective/entropy": 6.308849334716797, + "objective/kl": 8.870867729187012, + "objective/non_score_reward": -0.8870867490768433, + "objective/rlhf_reward": -1.4256407044091561, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 30.72797203063965, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6196576952934265, + "step": 1082, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000112533569336 + }, + { + "episode": 17344, + "epoch": 0.3117518064492936, + "loss/policy_avg": 0.3716828227043152, + "lr": 2.7923696319018404e-06, + "objective/entropy": 238.0005340576172, + "objective/kl": 11.961174011230469, + "objective/non_score_reward": -1.1961175203323364, + "objective/rlhf_reward": -0.3844699472188946, + "objective/scores": 1.1, + "policy/approxkl_avg": 157.48333740234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7168400287628174, + "step": 1083, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9966548681259155 + }, + { + "episode": 17360, + "epoch": 0.31203940036668226, + "loss/policy_avg": 0.11649422347545624, + "lr": 2.7921779141104297e-06, + "objective/entropy": 103.03892517089844, + "objective/kl": 14.831886291503906, + "objective/non_score_reward": -1.4831887483596802, + "objective/rlhf_reward": -3.009036113263342, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 112.13179779052734, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9329808950424194, + "step": 1084, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000717639923096 + }, + { + "episode": 17376, + "epoch": 0.3123269942840709, + "loss/policy_avg": 0.050925977528095245, + "lr": 2.7919861963190185e-06, + "objective/entropy": 285.0555419921875, + "objective/kl": 12.683483123779297, + "objective/non_score_reward": -1.2683483362197876, + "objective/rlhf_reward": -7.07339334487915, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.9751580953598022, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.731339156627655, + "step": 1085, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00105881690979 + }, + { + "episode": 17392, + "epoch": 0.31261458820145954, + "loss/policy_avg": 0.2870730757713318, + "lr": 2.7917944785276073e-06, + "objective/entropy": 179.18856811523438, + "objective/kl": 21.071481704711914, + "objective/non_score_reward": -2.1071481704711914, + "objective/rlhf_reward": -8.028593218326568, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.975924491882324, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.758672833442688, + "step": 1086, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997131824493408 + }, + { + "episode": 17408, + "epoch": 0.3129021821188482, + "loss/policy_avg": 0.15941983461380005, + "lr": 2.7916027607361965e-06, + "objective/entropy": 121.9920425415039, + "objective/kl": 12.823822975158691, + "objective/non_score_reward": -1.2823822498321533, + "objective/rlhf_reward": -4.729529297351837, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.64611053466797, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6388263702392578, + "step": 1087, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00076961517334 + }, + { + "episode": 17424, + "epoch": 0.3131897760362368, + "loss/policy_avg": -0.06212315335869789, + "lr": 2.7914110429447853e-06, + "objective/entropy": 38.291786193847656, + "objective/kl": 4.688940525054932, + "objective/non_score_reward": -0.4688940942287445, + "objective/rlhf_reward": -1.4755764067173003, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.398519515991211, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6102602481842041, + "step": 1088, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.00457501411438 + }, + { + "episode": 17440, + "epoch": 0.31347736995362546, + "loss/policy_avg": 0.9298348426818848, + "lr": 2.791219325153374e-06, + "objective/entropy": 100.21686553955078, + "objective/kl": 13.85842227935791, + "objective/non_score_reward": -1.3858420848846436, + "objective/rlhf_reward": -3.1433682352304455, + "objective/scores": 0.6, + "policy/approxkl_avg": 33.18985366821289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6664258241653442, + "step": 1089, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988486766815186 + }, + { + "episode": 17456, + "epoch": 0.3137649638710141, + "loss/policy_avg": 0.44839587807655334, + "lr": 2.7910276073619634e-06, + "objective/entropy": -5.302301406860352, + "objective/kl": 11.534767150878906, + "objective/non_score_reward": -1.1534767150878906, + "objective/rlhf_reward": -2.789078231128763, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 26.897872924804688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6113318204879761, + "step": 1090, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999198079109192 + }, + { + "episode": 17472, + "epoch": 0.3140525577884028, + "loss/policy_avg": -0.0263579823076725, + "lr": 2.790835889570552e-06, + "objective/entropy": 53.606441497802734, + "objective/kl": 11.902129173278809, + "objective/non_score_reward": -1.1902129650115967, + "objective/rlhf_reward": -6.760851860046387, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.736373901367188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.505375862121582, + "step": 1091, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991669654846191 + }, + { + "episode": 17488, + "epoch": 0.31434015170579144, + "loss/policy_avg": 0.02518617734313011, + "lr": 2.7906441717791414e-06, + "objective/entropy": 146.1632843017578, + "objective/kl": 13.355687141418457, + "objective/non_score_reward": -1.335568904876709, + "objective/rlhf_reward": -0.9422759175300595, + "objective/scores": 1.1, + "policy/approxkl_avg": 39.9644889831543, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6409592032432556, + "step": 1092, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996291399002075 + }, + { + "episode": 17504, + "epoch": 0.3146277456231801, + "loss/policy_avg": 0.06718438118696213, + "lr": 2.79045245398773e-06, + "objective/entropy": 31.78778076171875, + "objective/kl": 16.474475860595703, + "objective/non_score_reward": -1.6474474668502808, + "objective/rlhf_reward": -6.189789867401123, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.23661804199219, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5104482769966125, + "step": 1093, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979634284973145 + }, + { + "episode": 17520, + "epoch": 0.3149153395405687, + "loss/policy_avg": 0.013752680271863937, + "lr": 2.790260736196319e-06, + "objective/entropy": -115.1087646484375, + "objective/kl": 15.564956665039062, + "objective/non_score_reward": -1.5564956665039062, + "objective/rlhf_reward": -5.825982904434204, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.01404571533203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8566933274269104, + "step": 1094, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998520851135254 + }, + { + "episode": 17536, + "epoch": 0.31520293345795736, + "loss/policy_avg": 0.4633193016052246, + "lr": 2.7900690184049083e-06, + "objective/entropy": 209.13702392578125, + "objective/kl": 13.725728034973145, + "objective/non_score_reward": -1.372572898864746, + "objective/rlhf_reward": -5.090291237831115, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.772090911865234, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7493543028831482, + "step": 1095, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992456436157227 + }, + { + "episode": 17552, + "epoch": 0.315490527375346, + "loss/policy_avg": -0.17638610303401947, + "lr": 2.789877300613497e-06, + "objective/entropy": 143.67054748535156, + "objective/kl": 10.347640037536621, + "objective/non_score_reward": -1.0347639322280884, + "objective/rlhf_reward": 0.2609441518783573, + "objective/scores": 1.1, + "policy/approxkl_avg": 35.974178314208984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6519808173179626, + "step": 1096, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.00136399269104 + }, + { + "episode": 17568, + "epoch": 0.31577812129273464, + "loss/policy_avg": 0.25082990527153015, + "lr": 2.789685582822086e-06, + "objective/entropy": 135.237060546875, + "objective/kl": 11.404836654663086, + "objective/non_score_reward": -1.1404837369918823, + "objective/rlhf_reward": -6.5619354248046875, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.249817848205566, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.39255958795547485, + "step": 1097, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993515014648438 + }, + { + "episode": 17584, + "epoch": 0.31606571521012333, + "loss/policy_avg": 0.24139416217803955, + "lr": 2.789493865030675e-06, + "objective/entropy": 52.1706428527832, + "objective/kl": 11.25928783416748, + "objective/non_score_reward": -1.1259288787841797, + "objective/rlhf_reward": -4.103715336322784, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.120098114013672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49053245782852173, + "step": 1098, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.00132417678833 + }, + { + "episode": 17600, + "epoch": 0.316353309127512, + "loss/policy_avg": 0.09163187444210052, + "lr": 2.789302147239264e-06, + "objective/entropy": -32.953826904296875, + "objective/kl": 13.274989128112793, + "objective/non_score_reward": -1.3274990320205688, + "objective/rlhf_reward": -7.309996128082275, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.395530700683594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6640526652336121, + "step": 1099, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997539758682251 + }, + { + "episode": 17616, + "epoch": 0.3166409030449006, + "loss/policy_avg": 0.04822006821632385, + "lr": 2.789110429447853e-06, + "objective/entropy": 137.935791015625, + "objective/kl": 16.86005973815918, + "objective/non_score_reward": -1.6860061883926392, + "objective/rlhf_reward": -8.744024276733398, + "objective/scores": -0.5, + "policy/approxkl_avg": 82.54655456542969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9151659607887268, + "step": 1100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996995449066162 + }, + { + "episode": 17632, + "epoch": 0.31692849696228925, + "loss/policy_avg": 0.3273594379425049, + "lr": 2.7889187116564415e-06, + "objective/entropy": -49.96745300292969, + "objective/kl": 14.063299179077148, + "objective/non_score_reward": -1.4063299894332886, + "objective/rlhf_reward": -5.225319838523864, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.512229919433594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7904208898544312, + "step": 1101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9996654987335205 + }, + { + "episode": 17648, + "epoch": 0.3172160908796779, + "loss/policy_avg": 4.8567938804626465, + "lr": 2.7887269938650308e-06, + "objective/entropy": 26.933815002441406, + "objective/kl": 13.435043334960938, + "objective/non_score_reward": -1.3435043096542358, + "objective/rlhf_reward": -0.974017313122749, + "objective/scores": 1.1, + "policy/approxkl_avg": 59.44342041015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7176166772842407, + "step": 1102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0019140243530273 + }, + { + "episode": 17664, + "epoch": 0.31750368479706653, + "loss/policy_avg": 0.5443607568740845, + "lr": 2.7885352760736196e-06, + "objective/entropy": 148.4956817626953, + "objective/kl": 14.069759368896484, + "objective/non_score_reward": -1.4069759845733643, + "objective/rlhf_reward": -3.9660444311505425, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 58.521888732910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6067029237747192, + "step": 1103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998931884765625 + }, + { + "episode": 17680, + "epoch": 0.31779127871445517, + "loss/policy_avg": 0.1637672483921051, + "lr": 2.7883435582822084e-06, + "objective/entropy": 151.56597900390625, + "objective/kl": 12.996841430664062, + "objective/non_score_reward": -1.2996841669082642, + "objective/rlhf_reward": -7.198736667633057, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.401649475097656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6609679460525513, + "step": 1104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000152587890625 + }, + { + "episode": 17696, + "epoch": 0.3180788726318438, + "loss/policy_avg": 0.23036092519760132, + "lr": 2.7881518404907976e-06, + "objective/entropy": -163.6939697265625, + "objective/kl": 11.472090721130371, + "objective/non_score_reward": -1.1472091674804688, + "objective/rlhf_reward": -4.188836640119552, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.7734489440918, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7076303362846375, + "step": 1105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997105598449707 + }, + { + "episode": 17712, + "epoch": 0.3183664665492325, + "loss/policy_avg": 0.19628867506980896, + "lr": 2.7879601226993864e-06, + "objective/entropy": 42.15092468261719, + "objective/kl": 12.80676555633545, + "objective/non_score_reward": -1.2806766033172607, + "objective/rlhf_reward": -7.122706413269043, + "objective/scores": -0.5, + "policy/approxkl_avg": 41.839134216308594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6349719762802124, + "step": 1106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971110820770264 + }, + { + "episode": 17728, + "epoch": 0.31865406046662115, + "loss/policy_avg": 0.08596844226121902, + "lr": 2.7877684049079757e-06, + "objective/entropy": 103.33952331542969, + "objective/kl": 14.068815231323242, + "objective/non_score_reward": -1.4068814516067505, + "objective/rlhf_reward": -5.227526044845581, + "objective/scores": 0.1, + "policy/approxkl_avg": 44.43631362915039, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5943961143493652, + "step": 1107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989540576934814 + }, + { + "episode": 17744, + "epoch": 0.3189416543840098, + "loss/policy_avg": -0.21316379308700562, + "lr": 2.7875766871165645e-06, + "objective/entropy": 90.92996978759766, + "objective/kl": 9.894368171691895, + "objective/non_score_reward": -0.9894368052482605, + "objective/rlhf_reward": -1.557747161388397, + "objective/scores": 0.6, + "policy/approxkl_avg": 8.975800514221191, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5788801908493042, + "step": 1108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.013200521469116 + }, + { + "episode": 17760, + "epoch": 0.3192292483013984, + "loss/policy_avg": 0.11199735105037689, + "lr": 2.7873849693251533e-06, + "objective/entropy": 234.6155548095703, + "objective/kl": 15.355046272277832, + "objective/non_score_reward": -1.5355048179626465, + "objective/rlhf_reward": -4.317190463813851, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.193365097045898, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7720480561256409, + "step": 1109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996470212936401 + }, + { + "episode": 17776, + "epoch": 0.31951684221878707, + "loss/policy_avg": 0.39977481961250305, + "lr": 2.7871932515337425e-06, + "objective/entropy": -96.6143569946289, + "objective/kl": 21.307180404663086, + "objective/non_score_reward": -2.1307179927825928, + "objective/rlhf_reward": -6.96661278506811, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 83.29231262207031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5110878944396973, + "step": 1110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002777576446533 + }, + { + "episode": 17792, + "epoch": 0.3198044361361757, + "loss/policy_avg": 0.21299278736114502, + "lr": 2.7870015337423313e-06, + "objective/entropy": -37.447601318359375, + "objective/kl": 13.953254699707031, + "objective/non_score_reward": -1.3953255414962769, + "objective/rlhf_reward": -7.581302165985107, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.7848014831543, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.45726099610328674, + "step": 1111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998581409454346 + }, + { + "episode": 17808, + "epoch": 0.32009203005356435, + "loss/policy_avg": 0.8188914656639099, + "lr": 2.78680981595092e-06, + "objective/entropy": 115.01608276367188, + "objective/kl": 13.266737937927246, + "objective/non_score_reward": -1.3266738653182983, + "objective/rlhf_reward": -4.906695282459259, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.617708206176758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6811967492103577, + "step": 1112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996246099472046 + }, + { + "episode": 17824, + "epoch": 0.320379623970953, + "loss/policy_avg": 0.4930746555328369, + "lr": 2.7866180981595094e-06, + "objective/entropy": -53.15104675292969, + "objective/kl": 19.036401748657227, + "objective/non_score_reward": -1.9036401510238647, + "objective/rlhf_reward": -5.491854207889114, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 156.63845825195312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6161982417106628, + "step": 1113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9972939491271973 + }, + { + "episode": 17840, + "epoch": 0.3206672178883417, + "loss/policy_avg": -0.006604592781513929, + "lr": 2.786426380368098e-06, + "objective/entropy": 11.049800872802734, + "objective/kl": 15.964569091796875, + "objective/non_score_reward": -1.59645676612854, + "objective/rlhf_reward": -5.985827451944351, + "objective/scores": 0.1, + "policy/approxkl_avg": 111.54328155517578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6989386081695557, + "step": 1114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984581470489502 + }, + { + "episode": 17856, + "epoch": 0.3209548118057303, + "loss/policy_avg": 0.34559106826782227, + "lr": 2.7862346625766874e-06, + "objective/entropy": 62.411766052246094, + "objective/kl": 14.781805992126465, + "objective/non_score_reward": -1.4781804084777832, + "objective/rlhf_reward": -7.912721633911133, + "objective/scores": -0.5, + "policy/approxkl_avg": 61.101905822753906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7949924468994141, + "step": 1115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982895851135254 + }, + { + "episode": 17872, + "epoch": 0.32124240572311896, + "loss/policy_avg": 0.17752394080162048, + "lr": 2.7860429447852762e-06, + "objective/entropy": -58.06376647949219, + "objective/kl": 11.518682479858398, + "objective/non_score_reward": -1.151868224143982, + "objective/rlhf_reward": -4.20747292637825, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.080097198486328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6485680937767029, + "step": 1116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0006484985351562 + }, + { + "episode": 17888, + "epoch": 0.3215299996405076, + "loss/policy_avg": -0.11512506008148193, + "lr": 2.785851226993865e-06, + "objective/entropy": 34.327606201171875, + "objective/kl": 6.476920127868652, + "objective/non_score_reward": -0.6476920247077942, + "objective/rlhf_reward": -4.590767860412598, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.352293014526367, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6617291569709778, + "step": 1117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993085861206055 + }, + { + "episode": 17904, + "epoch": 0.32181759355789624, + "loss/policy_avg": 0.160440132021904, + "lr": 2.7856595092024543e-06, + "objective/entropy": 60.79460906982422, + "objective/kl": 10.265054702758789, + "objective/non_score_reward": -1.0265053510665894, + "objective/rlhf_reward": -3.706021463871002, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.06240463256836, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6272501945495605, + "step": 1118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996984004974365 + }, + { + "episode": 17920, + "epoch": 0.3221051874752849, + "loss/policy_avg": 0.043839361518621445, + "lr": 2.785467791411043e-06, + "objective/entropy": 38.43065643310547, + "objective/kl": 5.596960067749023, + "objective/non_score_reward": -0.5596960783004761, + "objective/rlhf_reward": -4.238784313201904, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.959665298461914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7662742137908936, + "step": 1119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9981067180633545 + }, + { + "episode": 17936, + "epoch": 0.3223927813926735, + "loss/policy_avg": 0.08080266416072845, + "lr": 2.7852760736196323e-06, + "objective/entropy": -86.41799926757812, + "objective/kl": 9.186336517333984, + "objective/non_score_reward": -0.9186336398124695, + "objective/rlhf_reward": -0.7508156045686928, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 47.581703186035156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.609947681427002, + "step": 1120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973444938659668 + }, + { + "episode": 17952, + "epoch": 0.3226803753100622, + "loss/policy_avg": 0.4055587947368622, + "lr": 2.785084355828221e-06, + "objective/entropy": 73.900390625, + "objective/kl": 15.857680320739746, + "objective/non_score_reward": -1.5857681035995483, + "objective/rlhf_reward": -1.9430725932121273, + "objective/scores": 1.1, + "policy/approxkl_avg": 56.220314025878906, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5025008916854858, + "step": 1121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0059289932250977 + }, + { + "episode": 17968, + "epoch": 0.32296796922745086, + "loss/policy_avg": -0.31560468673706055, + "lr": 2.78489263803681e-06, + "objective/entropy": 112.73280334472656, + "objective/kl": 10.532523155212402, + "objective/non_score_reward": -1.0532522201538086, + "objective/rlhf_reward": 0.18699111938476598, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.733489990234375, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.8091140985488892, + "step": 1122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0048389434814453 + }, + { + "episode": 17984, + "epoch": 0.3232555631448395, + "loss/policy_avg": 0.011030721478164196, + "lr": 2.7847009202453987e-06, + "objective/entropy": 20.075477600097656, + "objective/kl": 13.247042655944824, + "objective/non_score_reward": -1.3247044086456299, + "objective/rlhf_reward": -4.898817485570907, + "objective/scores": 0.1, + "policy/approxkl_avg": 54.85459899902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5287940502166748, + "step": 1123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001558303833008 + }, + { + "episode": 18000, + "epoch": 0.32354315706222814, + "loss/policy_avg": 0.34385043382644653, + "lr": 2.7845092024539875e-06, + "objective/entropy": 26.090667724609375, + "objective/kl": 16.442066192626953, + "objective/non_score_reward": -1.6442067623138428, + "objective/rlhf_reward": -8.576827049255371, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.108585357666016, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6301190853118896, + "step": 1124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9967900514602661 + }, + { + "episode": 18016, + "epoch": 0.3238307509796168, + "loss/policy_avg": 0.019408032298088074, + "lr": 2.7843174846625768e-06, + "objective/entropy": 56.218666076660156, + "objective/kl": 21.929851531982422, + "objective/non_score_reward": -2.1929850578308105, + "objective/rlhf_reward": -10.771940231323242, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.08192443847656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7020080089569092, + "step": 1125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998396635055542 + }, + { + "episode": 18032, + "epoch": 0.3241183448970054, + "loss/policy_avg": -0.09649811685085297, + "lr": 2.7841257668711656e-06, + "objective/entropy": 111.22513580322266, + "objective/kl": 7.8993682861328125, + "objective/non_score_reward": -0.7899367809295654, + "objective/rlhf_reward": 1.2402529209852222, + "objective/scores": 1.1, + "policy/approxkl_avg": 15.892783164978027, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5796664357185364, + "step": 1126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0133345127105713 + }, + { + "episode": 18048, + "epoch": 0.32440593881439406, + "loss/policy_avg": 0.012353288009762764, + "lr": 2.7839340490797544e-06, + "objective/entropy": 35.718894958496094, + "objective/kl": 12.086095809936523, + "objective/non_score_reward": -1.2086095809936523, + "objective/rlhf_reward": -3.2303187585511974, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 25.718326568603516, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6076189279556274, + "step": 1127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0017600059509277 + }, + { + "episode": 18064, + "epoch": 0.3246935327317827, + "loss/policy_avg": 0.3458269238471985, + "lr": 2.7837423312883436e-06, + "objective/entropy": 187.83889770507812, + "objective/kl": 11.952428817749023, + "objective/non_score_reward": -1.1952428817749023, + "objective/rlhf_reward": -3.0476382980744043, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 6.217028617858887, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5002806186676025, + "step": 1128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992327690124512 + }, + { + "episode": 18080, + "epoch": 0.3249811266491714, + "loss/policy_avg": 0.142012819647789, + "lr": 2.7835506134969324e-06, + "objective/entropy": -29.551471710205078, + "objective/kl": 11.834535598754883, + "objective/non_score_reward": -1.1834536790847778, + "objective/rlhf_reward": -2.7864033979939773, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 20.23397445678711, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5848058462142944, + "step": 1129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9986717700958252 + }, + { + "episode": 18096, + "epoch": 0.32526872056656003, + "loss/policy_avg": 0.004172764718532562, + "lr": 2.7833588957055217e-06, + "objective/entropy": -160.0207977294922, + "objective/kl": 14.571106910705566, + "objective/non_score_reward": -1.4571107625961304, + "objective/rlhf_reward": -4.095109657446543, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 59.20903015136719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7981671094894409, + "step": 1130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990029335021973 + }, + { + "episode": 18112, + "epoch": 0.32555631448394867, + "loss/policy_avg": 0.15084603428840637, + "lr": 2.7831671779141105e-06, + "objective/entropy": 96.68032836914062, + "objective/kl": 13.1805419921875, + "objective/non_score_reward": -1.318053960800171, + "objective/rlhf_reward": -3.1495099685349803, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 25.39088249206543, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7604236602783203, + "step": 1131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000643730163574 + }, + { + "episode": 18128, + "epoch": 0.3258439084013373, + "loss/policy_avg": 0.19803990423679352, + "lr": 2.7829754601226993e-06, + "objective/entropy": 13.389362335205078, + "objective/kl": 16.05734634399414, + "objective/non_score_reward": -1.6057347059249878, + "objective/rlhf_reward": -8.42293930053711, + "objective/scores": -0.5, + "policy/approxkl_avg": 133.68670654296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7763709425926208, + "step": 1132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998838901519775 + }, + { + "episode": 18144, + "epoch": 0.32613150231872595, + "loss/policy_avg": -0.009128901176154613, + "lr": 2.7827837423312885e-06, + "objective/entropy": -21.005226135253906, + "objective/kl": 17.77182388305664, + "objective/non_score_reward": -1.7771823406219482, + "objective/rlhf_reward": -4.1850105866205425, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 100.30049896240234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5907350778579712, + "step": 1133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987881183624268 + }, + { + "episode": 18160, + "epoch": 0.3264190962361146, + "loss/policy_avg": 0.21724221110343933, + "lr": 2.7825920245398773e-06, + "objective/entropy": -26.32037353515625, + "objective/kl": 18.74835205078125, + "objective/non_score_reward": -1.8748352527618408, + "objective/rlhf_reward": -7.099340951442718, + "objective/scores": 0.1, + "policy/approxkl_avg": 106.53759765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.843207836151123, + "step": 1134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977933168411255 + }, + { + "episode": 18176, + "epoch": 0.32670669015350323, + "loss/policy_avg": 0.2329787313938141, + "lr": 2.7824003067484666e-06, + "objective/entropy": 129.7427978515625, + "objective/kl": 12.383695602416992, + "objective/non_score_reward": -1.2383698225021362, + "objective/rlhf_reward": -2.830773057714973, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 47.80897521972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6198921203613281, + "step": 1135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971883296966553 + }, + { + "episode": 18192, + "epoch": 0.3269942840708919, + "loss/policy_avg": 0.40199702978134155, + "lr": 2.7822085889570554e-06, + "objective/entropy": 108.27337646484375, + "objective/kl": 10.543985366821289, + "objective/non_score_reward": -1.0543984174728394, + "objective/rlhf_reward": -6.217593669891357, + "objective/scores": -0.5, + "policy/approxkl_avg": 57.99873352050781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5256673097610474, + "step": 1136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000418186187744 + }, + { + "episode": 18208, + "epoch": 0.32728187798828057, + "loss/policy_avg": 0.06313613057136536, + "lr": 2.782016871165644e-06, + "objective/entropy": 143.59881591796875, + "objective/kl": 13.319164276123047, + "objective/non_score_reward": -1.3319165706634521, + "objective/rlhf_reward": -7.327666282653809, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.44135284423828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5660614967346191, + "step": 1137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981001615524292 + }, + { + "episode": 18224, + "epoch": 0.3275694719056692, + "loss/policy_avg": 0.302053838968277, + "lr": 2.7818251533742334e-06, + "objective/entropy": 220.9147186279297, + "objective/kl": 15.305615425109863, + "objective/non_score_reward": -1.5305616855621338, + "objective/rlhf_reward": -4.518126699987965, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 81.97419738769531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8646837472915649, + "step": 1138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971197843551636 + }, + { + "episode": 18240, + "epoch": 0.32785706582305785, + "loss/policy_avg": 1.9834920167922974, + "lr": 2.7816334355828222e-06, + "objective/entropy": 120.74797821044922, + "objective/kl": 13.918863296508789, + "objective/non_score_reward": -1.3918863534927368, + "objective/rlhf_reward": -5.167545443773269, + "objective/scores": 0.1, + "policy/approxkl_avg": 63.093605041503906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6175976991653442, + "step": 1139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009703636169434 + }, + { + "episode": 18256, + "epoch": 0.3281446597404465, + "loss/policy_avg": 0.7380589842796326, + "lr": 2.781441717791411e-06, + "objective/entropy": -91.15982055664062, + "objective/kl": 11.536355018615723, + "objective/non_score_reward": -1.1536355018615723, + "objective/rlhf_reward": -3.010421965185719, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 42.84178161621094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5434945821762085, + "step": 1140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000872850418091 + }, + { + "episode": 18272, + "epoch": 0.3284322536578351, + "loss/policy_avg": 0.3735503554344177, + "lr": 2.7812500000000003e-06, + "objective/entropy": 37.28156280517578, + "objective/kl": 13.715288162231445, + "objective/non_score_reward": -1.3715288639068604, + "objective/rlhf_reward": -7.486115455627441, + "objective/scores": -0.5, + "policy/approxkl_avg": 58.20576095581055, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7024419903755188, + "step": 1141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980452060699463 + }, + { + "episode": 18288, + "epoch": 0.32871984757522377, + "loss/policy_avg": 0.10260803997516632, + "lr": 2.781058282208589e-06, + "objective/entropy": -110.40155029296875, + "objective/kl": 10.552820205688477, + "objective/non_score_reward": -1.0552821159362793, + "objective/rlhf_reward": -3.8211283445358273, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.53842544555664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.618804931640625, + "step": 1142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0004189014434814 + }, + { + "episode": 18304, + "epoch": 0.3290074414926124, + "loss/policy_avg": -0.2922723889350891, + "lr": 2.7808665644171783e-06, + "objective/entropy": 43.667179107666016, + "objective/kl": 15.767911911010742, + "objective/non_score_reward": -1.576791524887085, + "objective/rlhf_reward": -5.907165741920471, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.228885650634766, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7186756134033203, + "step": 1143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0082926750183105 + }, + { + "episode": 18320, + "epoch": 0.3292950354100011, + "loss/policy_avg": 0.15897995233535767, + "lr": 2.780674846625767e-06, + "objective/entropy": 28.125289916992188, + "objective/kl": 16.386962890625, + "objective/non_score_reward": -1.638696312904358, + "objective/rlhf_reward": -8.554784774780273, + "objective/scores": -0.5, + "policy/approxkl_avg": 72.38021850585938, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.663409948348999, + "step": 1144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980506896972656 + }, + { + "episode": 18336, + "epoch": 0.32958262932738974, + "loss/policy_avg": 0.10999254137277603, + "lr": 2.780483128834356e-06, + "objective/entropy": -168.7968292236328, + "objective/kl": 4.2082414627075195, + "objective/non_score_reward": -0.4208241403102875, + "objective/rlhf_reward": -1.28329656124115, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.389283180236816, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.515932559967041, + "step": 1145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000765085220337 + }, + { + "episode": 18352, + "epoch": 0.3298702232447784, + "loss/policy_avg": 0.08765261620283127, + "lr": 2.7802914110429447e-06, + "objective/entropy": -20.226409912109375, + "objective/kl": 21.756072998046875, + "objective/non_score_reward": -2.175607442855835, + "objective/rlhf_reward": -8.302429950237274, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.168212890625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5186696648597717, + "step": 1146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991587400436401 + }, + { + "episode": 18368, + "epoch": 0.330157817162167, + "loss/policy_avg": 0.15273040533065796, + "lr": 2.7800996932515335e-06, + "objective/entropy": 31.48741912841797, + "objective/kl": 19.75265884399414, + "objective/non_score_reward": -1.975265622138977, + "objective/rlhf_reward": -3.5010626077651974, + "objective/scores": 1.1, + "policy/approxkl_avg": 20.612037658691406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4049757719039917, + "step": 1147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988133907318115 + }, + { + "episode": 18384, + "epoch": 0.33044541107955566, + "loss/policy_avg": -0.058190956711769104, + "lr": 2.7799079754601228e-06, + "objective/entropy": -115.31792449951172, + "objective/kl": 17.150239944458008, + "objective/non_score_reward": -1.7150239944458008, + "objective/rlhf_reward": -2.4600957393646237, + "objective/scores": 1.1, + "policy/approxkl_avg": 95.23487854003906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.754326343536377, + "step": 1148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9976162910461426 + }, + { + "episode": 18400, + "epoch": 0.3307330049969443, + "loss/policy_avg": 0.23397204279899597, + "lr": 2.7797162576687116e-06, + "objective/entropy": 135.87664794921875, + "objective/kl": 21.47640609741211, + "objective/non_score_reward": -2.1476407051086426, + "objective/rlhf_reward": -8.190562522411346, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.630273818969727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7489320039749146, + "step": 1149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979724884033203 + }, + { + "episode": 18416, + "epoch": 0.33102059891433294, + "loss/policy_avg": 0.2109401971101761, + "lr": 2.7795245398773004e-06, + "objective/entropy": -96.35713195800781, + "objective/kl": 7.339359283447266, + "objective/non_score_reward": -0.7339359521865845, + "objective/rlhf_reward": 1.4642561316490177, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.709714889526367, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5943701863288879, + "step": 1150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000924587249756 + }, + { + "episode": 18432, + "epoch": 0.3313081928317216, + "loss/policy_avg": 0.27022865414619446, + "lr": 2.7793328220858896e-06, + "objective/entropy": -70.71237182617188, + "objective/kl": 12.857504844665527, + "objective/non_score_reward": -1.2857506275177002, + "objective/rlhf_reward": -4.7430025100708, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.81177520751953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8109536170959473, + "step": 1151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997032880783081 + }, + { + "episode": 18448, + "epoch": 0.3315957867491103, + "loss/policy_avg": -0.1951054185628891, + "lr": 2.7791411042944784e-06, + "objective/entropy": -112.49846649169922, + "objective/kl": 10.639480590820312, + "objective/non_score_reward": -1.0639480352401733, + "objective/rlhf_reward": -2.6995326717763692, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.8408970832824707, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4948251247406006, + "step": 1152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0016403198242188 + }, + { + "episode": 18464, + "epoch": 0.3318833806664989, + "loss/policy_avg": 0.6007785797119141, + "lr": 2.7789493865030677e-06, + "objective/entropy": 2.571552276611328, + "objective/kl": 17.329076766967773, + "objective/non_score_reward": -1.732907772064209, + "objective/rlhf_reward": -2.5316311478614804, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.923664093017578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6116815805435181, + "step": 1153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988734722137451 + }, + { + "episode": 18480, + "epoch": 0.33217097458388756, + "loss/policy_avg": 1.0791575908660889, + "lr": 2.7787576687116565e-06, + "objective/entropy": 186.5494384765625, + "objective/kl": 21.728107452392578, + "objective/non_score_reward": -2.1728107929229736, + "objective/rlhf_reward": -6.291242933273315, + "objective/scores": 0.6, + "policy/approxkl_avg": 129.72695922851562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6914939284324646, + "step": 1154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973831176757812 + }, + { + "episode": 18496, + "epoch": 0.3324585685012762, + "loss/policy_avg": 0.3817293345928192, + "lr": 2.7785659509202453e-06, + "objective/entropy": -144.4022216796875, + "objective/kl": 14.357304573059082, + "objective/non_score_reward": -1.4357304573059082, + "objective/rlhf_reward": -7.742921829223633, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.8433094024658203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6484241485595703, + "step": 1155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001634120941162 + }, + { + "episode": 18512, + "epoch": 0.33274616241866484, + "loss/policy_avg": 0.34997937083244324, + "lr": 2.7783742331288345e-06, + "objective/entropy": -131.33566284179688, + "objective/kl": 13.435831069946289, + "objective/non_score_reward": -1.343583345413208, + "objective/rlhf_reward": -4.974333143234253, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.9140739440918, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6474661231040955, + "step": 1156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971636533737183 + }, + { + "episode": 18528, + "epoch": 0.3330337563360535, + "loss/policy_avg": 0.39980053901672363, + "lr": 2.7781825153374233e-06, + "objective/entropy": -103.27327728271484, + "objective/kl": 17.875957489013672, + "objective/non_score_reward": -1.787595510482788, + "objective/rlhf_reward": -6.750381997227668, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.51301574707031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6814613938331604, + "step": 1157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991803169250488 + }, + { + "episode": 18544, + "epoch": 0.3333213502534421, + "loss/policy_avg": 0.3572099804878235, + "lr": 2.7779907975460126e-06, + "objective/entropy": -19.225521087646484, + "objective/kl": 13.611352920532227, + "objective/non_score_reward": -1.3611352443695068, + "objective/rlhf_reward": -5.044541215896606, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.89737892150879, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7011643648147583, + "step": 1158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996830224990845 + }, + { + "episode": 18560, + "epoch": 0.3336089441708308, + "loss/policy_avg": 0.35091787576675415, + "lr": 2.7777990797546014e-06, + "objective/entropy": -124.19099426269531, + "objective/kl": 8.701774597167969, + "objective/non_score_reward": -0.8701775074005127, + "objective/rlhf_reward": -3.080709910392761, + "objective/scores": 0.1, + "policy/approxkl_avg": 71.30574035644531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5850265026092529, + "step": 1159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987276792526245 + }, + { + "episode": 18576, + "epoch": 0.33389653808821945, + "loss/policy_avg": 0.10446971654891968, + "lr": 2.77760736196319e-06, + "objective/entropy": 303.48541259765625, + "objective/kl": 14.655348777770996, + "objective/non_score_reward": -1.4655349254608154, + "objective/rlhf_reward": -7.862139701843262, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.810016632080078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9821838736534119, + "step": 1160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996795654296875 + }, + { + "episode": 18592, + "epoch": 0.3341841320056081, + "loss/policy_avg": 0.10929669439792633, + "lr": 2.7774156441717794e-06, + "objective/entropy": -19.155555725097656, + "objective/kl": 12.892605781555176, + "objective/non_score_reward": -1.2892606258392334, + "objective/rlhf_reward": -3.209631155209477, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 19.19078826904297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5724284648895264, + "step": 1161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9966273307800293 + }, + { + "episode": 18608, + "epoch": 0.33447172592299673, + "loss/policy_avg": 0.31029993295669556, + "lr": 2.7772239263803682e-06, + "objective/entropy": 104.80690002441406, + "objective/kl": 18.291303634643555, + "objective/non_score_reward": -1.8291301727294922, + "objective/rlhf_reward": -4.392801736236784, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 215.850830078125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6571725606918335, + "step": 1162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998605728149414 + }, + { + "episode": 18624, + "epoch": 0.33475931984038537, + "loss/policy_avg": -0.5111748576164246, + "lr": 2.777032208588957e-06, + "objective/entropy": -37.404815673828125, + "objective/kl": 12.505797386169434, + "objective/non_score_reward": -1.250579833984375, + "objective/rlhf_reward": -4.60231921672821, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.684650421142578, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.628068208694458, + "step": 1163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0031986236572266 + }, + { + "episode": 18640, + "epoch": 0.335046913757774, + "loss/policy_avg": 0.5985330939292908, + "lr": 2.7768404907975463e-06, + "objective/entropy": 85.2934799194336, + "objective/kl": 21.97635269165039, + "objective/non_score_reward": -2.1976354122161865, + "objective/rlhf_reward": -8.39054157435894, + "objective/scores": 0.1, + "policy/approxkl_avg": 46.09657669067383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8425604104995728, + "step": 1164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9965444803237915 + }, + { + "episode": 18656, + "epoch": 0.33533450767516265, + "loss/policy_avg": -0.33150577545166016, + "lr": 2.776648773006135e-06, + "objective/entropy": 17.754005432128906, + "objective/kl": 11.162395477294922, + "objective/non_score_reward": -1.1162395477294922, + "objective/rlhf_reward": -6.464958667755127, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.122459411621094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8440073132514954, + "step": 1165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0027763843536377 + }, + { + "episode": 18672, + "epoch": 0.3356221015925513, + "loss/policy_avg": 0.5681189298629761, + "lr": 2.7764570552147243e-06, + "objective/entropy": 174.1182861328125, + "objective/kl": 13.990375518798828, + "objective/non_score_reward": -1.399037480354309, + "objective/rlhf_reward": -7.596149921417236, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.488868713378906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4357028007507324, + "step": 1166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000075340270996 + }, + { + "episode": 18688, + "epoch": 0.33590969550994, + "loss/policy_avg": 0.6947311162948608, + "lr": 2.7762653374233127e-06, + "objective/entropy": -161.28807067871094, + "objective/kl": 13.998380661010742, + "objective/non_score_reward": -1.3998382091522217, + "objective/rlhf_reward": -1.199352583289146, + "objective/scores": 1.1, + "policy/approxkl_avg": 57.86552429199219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6250740885734558, + "step": 1167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996944665908813 + }, + { + "episode": 18704, + "epoch": 0.3361972894273286, + "loss/policy_avg": 0.1502753645181656, + "lr": 2.776073619631902e-06, + "objective/entropy": 189.06851196289062, + "objective/kl": 6.054288864135742, + "objective/non_score_reward": -0.6054288148880005, + "objective/rlhf_reward": -2.021715438365936, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.8485565185546875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48803550004959106, + "step": 1168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998919129371643 + }, + { + "episode": 18720, + "epoch": 0.33648488334471727, + "loss/policy_avg": 0.42749345302581787, + "lr": 2.7758819018404907e-06, + "objective/entropy": -281.3139953613281, + "objective/kl": 12.804277420043945, + "objective/non_score_reward": -1.2804276943206787, + "objective/rlhf_reward": -4.721710836887359, + "objective/scores": 0.1, + "policy/approxkl_avg": 67.71830749511719, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7152395844459534, + "step": 1169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9967622756958008 + }, + { + "episode": 18736, + "epoch": 0.3367724772621059, + "loss/policy_avg": 0.06842047721147537, + "lr": 2.7756901840490796e-06, + "objective/entropy": -40.159446716308594, + "objective/kl": 17.861766815185547, + "objective/non_score_reward": -1.7861766815185547, + "objective/rlhf_reward": -6.744707024097442, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.40418243408203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5496320724487305, + "step": 1170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001437187194824 + }, + { + "episode": 18752, + "epoch": 0.33706007117949455, + "loss/policy_avg": 0.7477625608444214, + "lr": 2.7754984662576688e-06, + "objective/entropy": -32.50873565673828, + "objective/kl": 17.935222625732422, + "objective/non_score_reward": -1.7935223579406738, + "objective/rlhf_reward": -4.774089819192886, + "objective/scores": 0.6, + "policy/approxkl_avg": 59.91670227050781, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.615089476108551, + "step": 1171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9974488019943237 + }, + { + "episode": 18768, + "epoch": 0.3373476650968832, + "loss/policy_avg": -0.10740236937999725, + "lr": 2.7753067484662576e-06, + "objective/entropy": -91.79651641845703, + "objective/kl": 12.429764747619629, + "objective/non_score_reward": -1.242976427078247, + "objective/rlhf_reward": -4.571905589103698, + "objective/scores": 0.1, + "policy/approxkl_avg": 12.65214729309082, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6722214221954346, + "step": 1172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0004234313964844 + }, + { + "episode": 18784, + "epoch": 0.3376352590142718, + "loss/policy_avg": -0.14357039332389832, + "lr": 2.775115030674847e-06, + "objective/entropy": -119.25468444824219, + "objective/kl": 11.105037689208984, + "objective/non_score_reward": -1.110503911972046, + "objective/rlhf_reward": -2.885756283011988, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.2598841190338135, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7479334473609924, + "step": 1173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0019779205322266 + }, + { + "episode": 18800, + "epoch": 0.3379228529316605, + "loss/policy_avg": 0.3568786382675171, + "lr": 2.7749233128834356e-06, + "objective/entropy": 276.316650390625, + "objective/kl": 14.202279090881348, + "objective/non_score_reward": -1.4202280044555664, + "objective/rlhf_reward": -3.5582059047379833, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 66.28675842285156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8677693605422974, + "step": 1174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986944198608398 + }, + { + "episode": 18816, + "epoch": 0.33821044684904916, + "loss/policy_avg": 0.37396079301834106, + "lr": 2.7747315950920244e-06, + "objective/entropy": 104.71287536621094, + "objective/kl": 18.649734497070312, + "objective/non_score_reward": -1.8649733066558838, + "objective/rlhf_reward": -7.059893345832824, + "objective/scores": 0.1, + "policy/approxkl_avg": 59.60235595703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.42154908180236816, + "step": 1175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960224628448486 + }, + { + "episode": 18832, + "epoch": 0.3384980407664378, + "loss/policy_avg": -0.2894097566604614, + "lr": 2.7745398773006137e-06, + "objective/entropy": -84.91337585449219, + "objective/kl": 12.799016952514648, + "objective/non_score_reward": -1.2799016237258911, + "objective/rlhf_reward": -0.7196066290140148, + "objective/scores": 1.1, + "policy/approxkl_avg": 67.74818420410156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6464543342590332, + "step": 1176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00795316696167 + }, + { + "episode": 18848, + "epoch": 0.33878563468382644, + "loss/policy_avg": 0.3645285367965698, + "lr": 2.7743481595092025e-06, + "objective/entropy": 168.74581909179688, + "objective/kl": 17.867380142211914, + "objective/non_score_reward": -1.786738395690918, + "objective/rlhf_reward": -2.746953225135803, + "objective/scores": 1.1, + "policy/approxkl_avg": 77.44456481933594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7084821462631226, + "step": 1177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979579448699951 + }, + { + "episode": 18864, + "epoch": 0.3390732286012151, + "loss/policy_avg": 0.4001834988594055, + "lr": 2.7741564417177913e-06, + "objective/entropy": 99.16302490234375, + "objective/kl": 13.607667922973633, + "objective/non_score_reward": -1.360766887664795, + "objective/rlhf_reward": -1.0430678486824032, + "objective/scores": 1.1, + "policy/approxkl_avg": 28.074718475341797, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5342350006103516, + "step": 1178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996448755264282 + }, + { + "episode": 18880, + "epoch": 0.3393608225186037, + "loss/policy_avg": 0.10984847694635391, + "lr": 2.7739647239263805e-06, + "objective/entropy": 180.45806884765625, + "objective/kl": 11.570047378540039, + "objective/non_score_reward": -1.157004952430725, + "objective/rlhf_reward": -4.228019899129867, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.99121856689453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5318220257759094, + "step": 1179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975459575653076 + }, + { + "episode": 18896, + "epoch": 0.33964841643599236, + "loss/policy_avg": 0.3910417854785919, + "lr": 2.7737730061349693e-06, + "objective/entropy": -8.10009765625, + "objective/kl": 8.290020942687988, + "objective/non_score_reward": -0.8290020823478699, + "objective/rlhf_reward": 1.0839917749166492, + "objective/scores": 1.1, + "policy/approxkl_avg": 37.283531188964844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5130699872970581, + "step": 1180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991108179092407 + }, + { + "episode": 18912, + "epoch": 0.339936010353381, + "loss/policy_avg": 0.014388211071491241, + "lr": 2.7735812883435586e-06, + "objective/entropy": 175.02603149414062, + "objective/kl": 15.754423141479492, + "objective/non_score_reward": -1.5754423141479492, + "objective/rlhf_reward": -8.301769256591797, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.509084701538086, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7703262567520142, + "step": 1181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978232383728027 + }, + { + "episode": 18928, + "epoch": 0.3402236042707697, + "loss/policy_avg": 0.055365175008773804, + "lr": 2.7733895705521474e-06, + "objective/entropy": -334.13519287109375, + "objective/kl": 8.783041000366211, + "objective/non_score_reward": -0.878304123878479, + "objective/rlhf_reward": -3.1132164955139157, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.43748474121094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6499872207641602, + "step": 1182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.999503493309021 + }, + { + "episode": 18944, + "epoch": 0.34051119818815834, + "loss/policy_avg": 0.07409346848726273, + "lr": 2.773197852760736e-06, + "objective/entropy": -83.13888549804688, + "objective/kl": 14.876424789428711, + "objective/non_score_reward": -1.487642526626587, + "objective/rlhf_reward": -5.550569987297058, + "objective/scores": 0.1, + "policy/approxkl_avg": 71.0294189453125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8472854495048523, + "step": 1183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9958550930023193 + }, + { + "episode": 18960, + "epoch": 0.340798792105547, + "loss/policy_avg": 0.9990078806877136, + "lr": 2.7730061349693254e-06, + "objective/entropy": 20.914791107177734, + "objective/kl": 14.88058853149414, + "objective/non_score_reward": -1.4880588054656982, + "objective/rlhf_reward": -5.552235579490661, + "objective/scores": 0.1, + "policy/approxkl_avg": 18.125185012817383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4836152493953705, + "step": 1184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983336925506592 + }, + { + "episode": 18976, + "epoch": 0.3410863860229356, + "loss/policy_avg": 0.048895176500082016, + "lr": 2.7728144171779142e-06, + "objective/entropy": 89.80201721191406, + "objective/kl": 12.485555648803711, + "objective/non_score_reward": -1.2485556602478027, + "objective/rlhf_reward": -4.594222491979599, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.1111341714859009, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4978290796279907, + "step": 1185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011367797851562 + }, + { + "episode": 18992, + "epoch": 0.34137397994032426, + "loss/policy_avg": 0.6291418075561523, + "lr": 2.7726226993865035e-06, + "objective/entropy": 154.68283081054688, + "objective/kl": 14.19253921508789, + "objective/non_score_reward": -1.419253945350647, + "objective/rlhf_reward": -3.5543094298997264, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 38.20229721069336, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7595288753509521, + "step": 1186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998978614807129 + }, + { + "episode": 19008, + "epoch": 0.3416615738577129, + "loss/policy_avg": 0.3727290630340576, + "lr": 2.7724309815950923e-06, + "objective/entropy": 136.96429443359375, + "objective/kl": 8.332364082336426, + "objective/non_score_reward": -0.8332364559173584, + "objective/rlhf_reward": -5.332945823669434, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.576475143432617, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7260855436325073, + "step": 1187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986281394958496 + }, + { + "episode": 19024, + "epoch": 0.34194916777510154, + "loss/policy_avg": 0.2877521514892578, + "lr": 2.772239263803681e-06, + "objective/entropy": -1.9418220520019531, + "objective/kl": 14.981258392333984, + "objective/non_score_reward": -1.4981257915496826, + "objective/rlhf_reward": -3.8697967848935466, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 115.05579376220703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6772172451019287, + "step": 1188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973249435424805 + }, + { + "episode": 19040, + "epoch": 0.3422367616924902, + "loss/policy_avg": 0.3203403949737549, + "lr": 2.7720475460122703e-06, + "objective/entropy": 126.06346893310547, + "objective/kl": 14.789588928222656, + "objective/non_score_reward": -1.4789589643478394, + "objective/rlhf_reward": -5.515835797786712, + "objective/scores": 0.1, + "policy/approxkl_avg": 37.32659149169922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7956616878509521, + "step": 1189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996840238571167 + }, + { + "episode": 19056, + "epoch": 0.34252435560987887, + "loss/policy_avg": 0.1269657164812088, + "lr": 2.7718558282208587e-06, + "objective/entropy": 7.045299530029297, + "objective/kl": 11.909355163574219, + "objective/non_score_reward": -1.1909356117248535, + "objective/rlhf_reward": -4.363742417097091, + "objective/scores": 0.1, + "policy/approxkl_avg": 51.526981353759766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6353417634963989, + "step": 1190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998396635055542 + }, + { + "episode": 19072, + "epoch": 0.3428119495272675, + "loss/policy_avg": 0.10352663695812225, + "lr": 2.771664110429448e-06, + "objective/entropy": 18.426589965820312, + "objective/kl": 17.08945655822754, + "objective/non_score_reward": -1.7089455127716064, + "objective/rlhf_reward": -6.43578211069107, + "objective/scores": 0.1, + "policy/approxkl_avg": 130.82119750976562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8962185382843018, + "step": 1191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980072975158691 + }, + { + "episode": 19088, + "epoch": 0.34309954344465615, + "loss/policy_avg": 0.2877599000930786, + "lr": 2.7714723926380367e-06, + "objective/entropy": 67.25830841064453, + "objective/kl": 13.41183090209961, + "objective/non_score_reward": -1.341183066368103, + "objective/rlhf_reward": -7.364732265472412, + "objective/scores": -0.5, + "policy/approxkl_avg": 136.07571411132812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.66959547996521, + "step": 1192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997651219367981 + }, + { + "episode": 19104, + "epoch": 0.3433871373620448, + "loss/policy_avg": 0.3532646894454956, + "lr": 2.7712806748466256e-06, + "objective/entropy": 232.93739318847656, + "objective/kl": 11.582132339477539, + "objective/non_score_reward": -1.1582132577896118, + "objective/rlhf_reward": -4.232852792739868, + "objective/scores": 0.1, + "policy/approxkl_avg": 66.96092224121094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9691135883331299, + "step": 1193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989712238311768 + }, + { + "episode": 19120, + "epoch": 0.34367473127943343, + "loss/policy_avg": 0.08443383872509003, + "lr": 2.771088957055215e-06, + "objective/entropy": 12.634071350097656, + "objective/kl": 14.32248592376709, + "objective/non_score_reward": -1.432248592376709, + "objective/rlhf_reward": -7.728994369506836, + "objective/scores": -0.5, + "policy/approxkl_avg": 155.06851196289062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6068894863128662, + "step": 1194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997370958328247 + }, + { + "episode": 19136, + "epoch": 0.34396232519682207, + "loss/policy_avg": 0.05074826255440712, + "lr": 2.7708972392638036e-06, + "objective/entropy": -49.18777847290039, + "objective/kl": 16.202213287353516, + "objective/non_score_reward": -1.6202213764190674, + "objective/rlhf_reward": -4.3581790349641185, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 74.01683044433594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6107933521270752, + "step": 1195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9966356754302979 + }, + { + "episode": 19152, + "epoch": 0.3442499191142107, + "loss/policy_avg": 1.3108307123184204, + "lr": 2.770705521472393e-06, + "objective/entropy": 54.5289421081543, + "objective/kl": 15.158395767211914, + "objective/non_score_reward": -1.5158395767211914, + "objective/rlhf_reward": -5.6633586347103115, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.8598058223724365, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.6213253736495972, + "step": 1196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0031495094299316 + }, + { + "episode": 19168, + "epoch": 0.3445375130315994, + "loss/policy_avg": 0.09720651805400848, + "lr": 2.7705138036809816e-06, + "objective/entropy": 94.39827728271484, + "objective/kl": 7.398892402648926, + "objective/non_score_reward": -0.7398892641067505, + "objective/rlhf_reward": -0.5595570117235185, + "objective/scores": 0.6, + "policy/approxkl_avg": 35.56739044189453, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7716015577316284, + "step": 1197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994008541107178 + }, + { + "episode": 19184, + "epoch": 0.34482510694898805, + "loss/policy_avg": 0.4629075527191162, + "lr": 2.7703220858895705e-06, + "objective/entropy": 152.28797912597656, + "objective/kl": 12.815742492675781, + "objective/non_score_reward": -1.2815742492675781, + "objective/rlhf_reward": -4.726297056674957, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.886993408203125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.603091299533844, + "step": 1198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996881484985352 + }, + { + "episode": 19200, + "epoch": 0.3451127008663767, + "loss/policy_avg": 0.381091833114624, + "lr": 2.7701303680981597e-06, + "objective/entropy": -184.68792724609375, + "objective/kl": 19.192184448242188, + "objective/non_score_reward": -1.9192183017730713, + "objective/rlhf_reward": -5.852044816288064, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 53.215721130371094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.672134518623352, + "step": 1199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9985284805297852 + }, + { + "episode": 19216, + "epoch": 0.3454002947837653, + "loss/policy_avg": 0.5557818412780762, + "lr": 2.7699386503067485e-06, + "objective/entropy": 74.49488830566406, + "objective/kl": 16.349365234375, + "objective/non_score_reward": -1.6349365711212158, + "objective/rlhf_reward": -3.6160276278269023, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 71.15203857421875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5768604278564453, + "step": 1200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997605800628662 + }, + { + "episode": 19232, + "epoch": 0.34568788870115397, + "loss/policy_avg": 0.3999622166156769, + "lr": 2.7697469325153373e-06, + "objective/entropy": 29.93677520751953, + "objective/kl": 15.555572509765625, + "objective/non_score_reward": -1.5555572509765625, + "objective/rlhf_reward": -4.618109259668904, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 138.052978515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6741936206817627, + "step": 1201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9976664781570435 + }, + { + "episode": 19248, + "epoch": 0.3459754826185426, + "loss/policy_avg": -0.13153797388076782, + "lr": 2.7695552147239265e-06, + "objective/entropy": -79.21650695800781, + "objective/kl": 17.083938598632812, + "objective/non_score_reward": -1.7083938121795654, + "objective/rlhf_reward": -2.433575308322906, + "objective/scores": 1.1, + "policy/approxkl_avg": 41.158538818359375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5853614807128906, + "step": 1202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991123676300049 + }, + { + "episode": 19264, + "epoch": 0.34626307653593125, + "loss/policy_avg": 0.5117297172546387, + "lr": 2.7693634969325153e-06, + "objective/entropy": 2.7270851135253906, + "objective/kl": 17.102258682250977, + "objective/non_score_reward": -1.7102259397506714, + "objective/rlhf_reward": -8.840904235839844, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.606922149658203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8073578476905823, + "step": 1203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986017942428589 + }, + { + "episode": 19280, + "epoch": 0.3465506704533199, + "loss/policy_avg": 0.020893381908535957, + "lr": 2.7691717791411046e-06, + "objective/entropy": -169.71463012695312, + "objective/kl": 7.1462202072143555, + "objective/non_score_reward": -0.7146221399307251, + "objective/rlhf_reward": -2.4584885299205776, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.70250701904297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5930685997009277, + "step": 1204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004806518554688 + }, + { + "episode": 19296, + "epoch": 0.3468382643707086, + "loss/policy_avg": 0.5275117754936218, + "lr": 2.7689800613496934e-06, + "objective/entropy": 21.12427520751953, + "objective/kl": 20.648849487304688, + "objective/non_score_reward": -2.064885139465332, + "objective/rlhf_reward": -5.859540140628814, + "objective/scores": 0.6, + "policy/approxkl_avg": 26.942766189575195, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6685649156570435, + "step": 1205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997446894645691 + }, + { + "episode": 19312, + "epoch": 0.3471258582880972, + "loss/policy_avg": -0.19722406566143036, + "lr": 2.768788343558282e-06, + "objective/entropy": 255.74984741210938, + "objective/kl": 14.213126182556152, + "objective/non_score_reward": -1.4213125705718994, + "objective/rlhf_reward": -2.761531566024992, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 18.114723205566406, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7719753980636597, + "step": 1206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006258487701416 + }, + { + "episode": 19328, + "epoch": 0.34741345220548586, + "loss/policy_avg": 0.0459427610039711, + "lr": 2.7685966257668714e-06, + "objective/entropy": -24.662124633789062, + "objective/kl": 11.61572265625, + "objective/non_score_reward": -1.1615723371505737, + "objective/rlhf_reward": -2.9129559258619944, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 10.064481735229492, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6453742384910583, + "step": 1207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978961944580078 + }, + { + "episode": 19344, + "epoch": 0.3477010461228745, + "loss/policy_avg": 0.03587307780981064, + "lr": 2.7684049079754602e-06, + "objective/entropy": 79.69007110595703, + "objective/kl": 15.860215187072754, + "objective/non_score_reward": -1.5860217809677124, + "objective/rlhf_reward": -3.9440871238708493, + "objective/scores": 0.6, + "policy/approxkl_avg": 59.39844512939453, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7233803272247314, + "step": 1208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987972974777222 + }, + { + "episode": 19360, + "epoch": 0.34798864004026314, + "loss/policy_avg": 0.338050901889801, + "lr": 2.7682131901840495e-06, + "objective/entropy": 172.87771606445312, + "objective/kl": 15.573002815246582, + "objective/non_score_reward": -1.557300329208374, + "objective/rlhf_reward": -5.829201197624206, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.08805084228516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.425936222076416, + "step": 1209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99677574634552 + }, + { + "episode": 19376, + "epoch": 0.3482762339576518, + "loss/policy_avg": 0.1206553503870964, + "lr": 2.7680214723926383e-06, + "objective/entropy": 82.61885070800781, + "objective/kl": 18.71231460571289, + "objective/non_score_reward": -1.8712315559387207, + "objective/rlhf_reward": -9.484926223754883, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.68077087402344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6749487519264221, + "step": 1210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997036457061768 + }, + { + "episode": 19392, + "epoch": 0.3485638278750404, + "loss/policy_avg": -0.030552543699741364, + "lr": 2.767829754601227e-06, + "objective/entropy": 258.60968017578125, + "objective/kl": 12.804272651672363, + "objective/non_score_reward": -1.28042733669281, + "objective/rlhf_reward": -4.72170922756195, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.8165931701660156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6673685908317566, + "step": 1211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015029907226562 + }, + { + "episode": 19408, + "epoch": 0.3488514217924291, + "loss/policy_avg": 0.26328012347221375, + "lr": 2.767638036809816e-06, + "objective/entropy": 81.95811462402344, + "objective/kl": 10.576614379882812, + "objective/non_score_reward": -1.0576614141464233, + "objective/rlhf_reward": -6.230645656585693, + "objective/scores": -0.5, + "policy/approxkl_avg": 53.74596405029297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5103533267974854, + "step": 1212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0021772384643555 + }, + { + "episode": 19424, + "epoch": 0.34913901570981776, + "loss/policy_avg": 0.7071826457977295, + "lr": 2.7674463190184047e-06, + "objective/entropy": 284.74810791015625, + "objective/kl": 14.454414367675781, + "objective/non_score_reward": -1.4454416036605835, + "objective/rlhf_reward": -1.3817664146423336, + "objective/scores": 1.1, + "policy/approxkl_avg": 162.52574157714844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.698977530002594, + "step": 1213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988505840301514 + }, + { + "episode": 19440, + "epoch": 0.3494266096272064, + "loss/policy_avg": 0.5319777131080627, + "lr": 2.767254601226994e-06, + "objective/entropy": 7.809246063232422, + "objective/kl": 18.88573455810547, + "objective/non_score_reward": -1.8885735273361206, + "objective/rlhf_reward": -7.154294228553772, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.536239624023438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5789269804954529, + "step": 1214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986400604248047 + }, + { + "episode": 19456, + "epoch": 0.34971420354459504, + "loss/policy_avg": 0.086372971534729, + "lr": 2.7670628834355828e-06, + "objective/entropy": -125.92728424072266, + "objective/kl": 6.277002334594727, + "objective/non_score_reward": -0.6277002692222595, + "objective/rlhf_reward": 0.41291793739679195, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 32.177825927734375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5431735515594482, + "step": 1215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989644289016724 + }, + { + "episode": 19472, + "epoch": 0.3500017974619837, + "loss/policy_avg": 0.684249997138977, + "lr": 2.7668711656441716e-06, + "objective/entropy": 105.52658081054688, + "objective/kl": 14.267902374267578, + "objective/non_score_reward": -1.4267902374267578, + "objective/rlhf_reward": -5.307160621881485, + "objective/scores": 0.1, + "policy/approxkl_avg": 172.38299560546875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49907970428466797, + "step": 1216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985556602478027 + }, + { + "episode": 19488, + "epoch": 0.3502893913793723, + "loss/policy_avg": 0.1681925654411316, + "lr": 2.766679447852761e-06, + "objective/entropy": -46.325313568115234, + "objective/kl": 10.954778671264648, + "objective/non_score_reward": -1.095477819442749, + "objective/rlhf_reward": -1.458192323089811, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 19.582189559936523, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5834884643554688, + "step": 1217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000382423400879 + }, + { + "episode": 19504, + "epoch": 0.35057698529676096, + "loss/policy_avg": 0.17251908779144287, + "lr": 2.7664877300613496e-06, + "objective/entropy": -134.26455688476562, + "objective/kl": 12.687049865722656, + "objective/non_score_reward": -1.268705129623413, + "objective/rlhf_reward": -7.074820518493652, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.018173217773438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4345284700393677, + "step": 1218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9991281032562256 + }, + { + "episode": 19520, + "epoch": 0.3508645792141496, + "loss/policy_avg": 0.17196732759475708, + "lr": 2.766296012269939e-06, + "objective/entropy": -33.67222595214844, + "objective/kl": 13.162768363952637, + "objective/non_score_reward": -1.3162769079208374, + "objective/rlhf_reward": -0.8651078552007672, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.336265563964844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5979089736938477, + "step": 1219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985971450805664 + }, + { + "episode": 19536, + "epoch": 0.3511521731315383, + "loss/policy_avg": 0.09419011324644089, + "lr": 2.7661042944785276e-06, + "objective/entropy": 13.70315170288086, + "objective/kl": 11.582527160644531, + "objective/non_score_reward": -1.1582528352737427, + "objective/rlhf_reward": -6.633011341094971, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.180685043334961, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6442111730575562, + "step": 1220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0029656887054443 + }, + { + "episode": 19552, + "epoch": 0.35143976704892693, + "loss/policy_avg": 0.2863805294036865, + "lr": 2.7659125766871165e-06, + "objective/entropy": 245.30551147460938, + "objective/kl": 16.196739196777344, + "objective/non_score_reward": -1.6196739673614502, + "objective/rlhf_reward": -8.4786958694458, + "objective/scores": -0.5, + "policy/approxkl_avg": 65.83010864257812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7952322959899902, + "step": 1221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992876052856445 + }, + { + "episode": 19568, + "epoch": 0.35172736096631557, + "loss/policy_avg": 0.6682702302932739, + "lr": 2.7657208588957057e-06, + "objective/entropy": -27.104393005371094, + "objective/kl": 8.179853439331055, + "objective/non_score_reward": -0.8179854154586792, + "objective/rlhf_reward": -2.8719416022300717, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.878311157226562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4288652837276459, + "step": 1222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993765354156494 + }, + { + "episode": 19584, + "epoch": 0.3520149548837042, + "loss/policy_avg": 0.33719485998153687, + "lr": 2.7655291411042945e-06, + "objective/entropy": -138.77658081054688, + "objective/kl": 15.182317733764648, + "objective/non_score_reward": -1.5182318687438965, + "objective/rlhf_reward": -8.072927474975586, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.492324829101562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5747877359390259, + "step": 1223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997852087020874 + }, + { + "episode": 19600, + "epoch": 0.35230254880109285, + "loss/policy_avg": 0.4515780210494995, + "lr": 2.7653374233128837e-06, + "objective/entropy": 64.33354949951172, + "objective/kl": 20.334157943725586, + "objective/non_score_reward": -2.0334157943725586, + "objective/rlhf_reward": -10.133663177490234, + "objective/scores": -0.5, + "policy/approxkl_avg": 121.74520874023438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7705545425415039, + "step": 1224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969508647918701 + }, + { + "episode": 19616, + "epoch": 0.3525901427184815, + "loss/policy_avg": 0.470808744430542, + "lr": 2.7651457055214725e-06, + "objective/entropy": 24.672637939453125, + "objective/kl": 17.567148208618164, + "objective/non_score_reward": -1.7567150592803955, + "objective/rlhf_reward": -6.626860237121582, + "objective/scores": 0.1, + "policy/approxkl_avg": 145.18096923828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.571911096572876, + "step": 1225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998140573501587 + }, + { + "episode": 19632, + "epoch": 0.35287773663587013, + "loss/policy_avg": 0.35610437393188477, + "lr": 2.7649539877300614e-06, + "objective/entropy": 52.07497024536133, + "objective/kl": 25.290016174316406, + "objective/non_score_reward": -2.5290017127990723, + "objective/rlhf_reward": -9.716007149219514, + "objective/scores": 0.1, + "policy/approxkl_avg": 150.7601318359375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6791574954986572, + "step": 1226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0010275840759277 + }, + { + "episode": 19648, + "epoch": 0.35316533055325877, + "loss/policy_avg": 0.5662277936935425, + "lr": 2.7647622699386506e-06, + "objective/entropy": 201.03201293945312, + "objective/kl": 13.793638229370117, + "objective/non_score_reward": -1.379363775253296, + "objective/rlhf_reward": -1.117455160617828, + "objective/scores": 1.1, + "policy/approxkl_avg": 86.15937805175781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6101793050765991, + "step": 1227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986412525177002 + }, + { + "episode": 19664, + "epoch": 0.35345292447064747, + "loss/policy_avg": 0.14119097590446472, + "lr": 2.7645705521472394e-06, + "objective/entropy": 205.80929565429688, + "objective/kl": 14.83749008178711, + "objective/non_score_reward": -1.4837491512298584, + "objective/rlhf_reward": -1.5349966049194332, + "objective/scores": 1.1, + "policy/approxkl_avg": 63.156280517578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7360595464706421, + "step": 1228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998273253440857 + }, + { + "episode": 19680, + "epoch": 0.3537405183880361, + "loss/policy_avg": 0.270801305770874, + "lr": 2.764378834355828e-06, + "objective/entropy": 201.06890869140625, + "objective/kl": 21.808286666870117, + "objective/non_score_reward": -2.180828809738159, + "objective/rlhf_reward": -8.323315000534057, + "objective/scores": 0.1, + "policy/approxkl_avg": 96.15478515625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7318389415740967, + "step": 1229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991538524627686 + }, + { + "episode": 19696, + "epoch": 0.35402811230542475, + "loss/policy_avg": -0.37473565340042114, + "lr": 2.7641871165644174e-06, + "objective/entropy": 160.452880859375, + "objective/kl": 10.46506118774414, + "objective/non_score_reward": -1.0465059280395508, + "objective/rlhf_reward": -1.262304966093275, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 75.31881713867188, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7993210554122925, + "step": 1230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0051486492156982 + }, + { + "episode": 19712, + "epoch": 0.3543157062228134, + "loss/policy_avg": 0.08775840699672699, + "lr": 2.7639953987730062e-06, + "objective/entropy": -58.0557861328125, + "objective/kl": 16.303316116333008, + "objective/non_score_reward": -1.6303316354751587, + "objective/rlhf_reward": -6.121326541900634, + "objective/scores": 0.1, + "policy/approxkl_avg": 36.74671173095703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6570830345153809, + "step": 1231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998000979423523 + }, + { + "episode": 19728, + "epoch": 0.354603300140202, + "loss/policy_avg": 0.24697911739349365, + "lr": 2.7638036809815955e-06, + "objective/entropy": -103.66670227050781, + "objective/kl": 13.675052642822266, + "objective/non_score_reward": -1.3675053119659424, + "objective/rlhf_reward": -3.645192380222391, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 8.908821105957031, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5298190712928772, + "step": 1232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975241422653198 + }, + { + "episode": 19744, + "epoch": 0.35489089405759067, + "loss/policy_avg": 0.0736452266573906, + "lr": 2.7636119631901843e-06, + "objective/entropy": -149.5491943359375, + "objective/kl": 13.649198532104492, + "objective/non_score_reward": -1.3649197816848755, + "objective/rlhf_reward": -3.3369730732598644, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 97.76475524902344, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6775115132331848, + "step": 1233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988733530044556 + }, + { + "episode": 19760, + "epoch": 0.3551784879749793, + "loss/policy_avg": 0.30038145184516907, + "lr": 2.763420245398773e-06, + "objective/entropy": 9.354202270507812, + "objective/kl": 13.509121894836426, + "objective/non_score_reward": -1.350912094116211, + "objective/rlhf_reward": -5.003648361563682, + "objective/scores": 0.1, + "policy/approxkl_avg": 48.25217819213867, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3075612783432007, + "step": 1234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0003974437713623 + }, + { + "episode": 19776, + "epoch": 0.355466081892368, + "loss/policy_avg": 0.41265279054641724, + "lr": 2.763228527607362e-06, + "objective/entropy": -128.67822265625, + "objective/kl": 13.989595413208008, + "objective/non_score_reward": -1.3989596366882324, + "objective/rlhf_reward": -4.039579230305508, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 38.396392822265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5274858474731445, + "step": 1235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000598907470703 + }, + { + "episode": 19792, + "epoch": 0.35575367580975664, + "loss/policy_avg": 0.16135510802268982, + "lr": 2.7630368098159507e-06, + "objective/entropy": -96.63204956054688, + "objective/kl": 12.73376178741455, + "objective/non_score_reward": -1.273376226425171, + "objective/rlhf_reward": -0.6935049951076504, + "objective/scores": 1.1, + "policy/approxkl_avg": 39.10956573486328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.670026421546936, + "step": 1236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0001089572906494 + }, + { + "episode": 19808, + "epoch": 0.3560412697271453, + "loss/policy_avg": 0.7625874280929565, + "lr": 2.76284509202454e-06, + "objective/entropy": -326.073974609375, + "objective/kl": 7.750607967376709, + "objective/non_score_reward": -0.775060772895813, + "objective/rlhf_reward": -2.700243210792541, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.49315643310547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7181180715560913, + "step": 1237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.999666690826416 + }, + { + "episode": 19824, + "epoch": 0.3563288636445339, + "loss/policy_avg": 0.008348610252141953, + "lr": 2.7626533742331288e-06, + "objective/entropy": -314.941650390625, + "objective/kl": 10.41222095489502, + "objective/non_score_reward": -1.0412222146987915, + "objective/rlhf_reward": -3.7648888289928433, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.16736602783203, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6663084626197815, + "step": 1238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00093936920166 + }, + { + "episode": 19840, + "epoch": 0.35661645756192256, + "loss/policy_avg": 0.36450040340423584, + "lr": 2.7624616564417176e-06, + "objective/entropy": -93.8783950805664, + "objective/kl": 10.081415176391602, + "objective/non_score_reward": -1.0081413984298706, + "objective/rlhf_reward": -3.632565504312515, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.1667366027832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6830465197563171, + "step": 1239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9981257915496826 + }, + { + "episode": 19856, + "epoch": 0.3569040514793112, + "loss/policy_avg": 0.29206255078315735, + "lr": 2.762269938650307e-06, + "objective/entropy": 194.64642333984375, + "objective/kl": 20.66879653930664, + "objective/non_score_reward": -2.0668797492980957, + "objective/rlhf_reward": -7.867519116401672, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.687646865844727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.741154670715332, + "step": 1240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989938735961914 + }, + { + "episode": 19872, + "epoch": 0.35719164539669984, + "loss/policy_avg": 0.3017123341560364, + "lr": 2.7620782208588956e-06, + "objective/entropy": 5.207828521728516, + "objective/kl": 17.089153289794922, + "objective/non_score_reward": -1.70891535282135, + "objective/rlhf_reward": -8.835660934448242, + "objective/scores": -0.5, + "policy/approxkl_avg": 90.6629638671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5502313375473022, + "step": 1241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000894069671631 + }, + { + "episode": 19888, + "epoch": 0.3574792393140885, + "loss/policy_avg": 0.44059765338897705, + "lr": 2.761886503067485e-06, + "objective/entropy": 93.31766510009766, + "objective/kl": 17.86589241027832, + "objective/non_score_reward": -1.7865893840789795, + "objective/rlhf_reward": -2.7463575363159176, + "objective/scores": 1.1, + "policy/approxkl_avg": 94.84536743164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7133783102035522, + "step": 1242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99619722366333 + }, + { + "episode": 19904, + "epoch": 0.3577668332314772, + "loss/policy_avg": 0.29421958327293396, + "lr": 2.7616947852760737e-06, + "objective/entropy": 1.1859626770019531, + "objective/kl": 19.047836303710938, + "objective/non_score_reward": -1.9047834873199463, + "objective/rlhf_reward": -7.2191343069076535, + "objective/scores": 0.1, + "policy/approxkl_avg": 83.07100677490234, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4693220257759094, + "step": 1243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991942644119263 + }, + { + "episode": 19920, + "epoch": 0.3580544271488658, + "loss/policy_avg": 0.2835289239883423, + "lr": 2.7615030674846625e-06, + "objective/entropy": 215.6124267578125, + "objective/kl": 13.682714462280273, + "objective/non_score_reward": -1.3682713508605957, + "objective/rlhf_reward": -2.549366746784422, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 10.411121368408203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8226298093795776, + "step": 1244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999145269393921 + }, + { + "episode": 19936, + "epoch": 0.35834202106625446, + "loss/policy_avg": 0.4547170400619507, + "lr": 2.7613113496932517e-06, + "objective/entropy": 76.82644653320312, + "objective/kl": 22.442930221557617, + "objective/non_score_reward": -2.244292736053467, + "objective/rlhf_reward": -8.577171301841737, + "objective/scores": 0.1, + "policy/approxkl_avg": 93.25566101074219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7442919015884399, + "step": 1245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.995746374130249 + }, + { + "episode": 19952, + "epoch": 0.3586296149836431, + "loss/policy_avg": 0.33403638005256653, + "lr": 2.7611196319018405e-06, + "objective/entropy": 315.484130859375, + "objective/kl": 18.028207778930664, + "objective/non_score_reward": -1.8028206825256348, + "objective/rlhf_reward": -4.811282730102539, + "objective/scores": 0.6, + "policy/approxkl_avg": 71.87936401367188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7787157297134399, + "step": 1246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999906301498413 + }, + { + "episode": 19968, + "epoch": 0.35891720890103174, + "loss/policy_avg": 0.4515129625797272, + "lr": 2.7609279141104297e-06, + "objective/entropy": 49.19495391845703, + "objective/kl": 16.58286476135254, + "objective/non_score_reward": -1.658286452293396, + "objective/rlhf_reward": -4.808317120346139, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 22.461732864379883, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5894609689712524, + "step": 1247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996263980865479 + }, + { + "episode": 19984, + "epoch": 0.3592048028184204, + "loss/policy_avg": 0.3973531723022461, + "lr": 2.7607361963190186e-06, + "objective/entropy": -21.238182067871094, + "objective/kl": 18.508586883544922, + "objective/non_score_reward": -1.8508589267730713, + "objective/rlhf_reward": -5.456024329142506, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 67.94461822509766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7065160274505615, + "step": 1248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000941514968872 + }, + { + "episode": 20000, + "epoch": 0.359492396735809, + "loss/policy_avg": 0.23100371658802032, + "lr": 2.7605444785276074e-06, + "objective/entropy": -163.1217041015625, + "objective/kl": 14.045398712158203, + "objective/non_score_reward": -1.4045398235321045, + "objective/rlhf_reward": -5.218158936500549, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.335323333740234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8941419720649719, + "step": 1249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0009543895721436 + }, + { + "episode": 20016, + "epoch": 0.3597799906531977, + "loss/policy_avg": 0.26674309372901917, + "lr": 2.7603527607361966e-06, + "objective/entropy": 37.34232711791992, + "objective/kl": 18.331321716308594, + "objective/non_score_reward": -1.833132028579712, + "objective/rlhf_reward": -2.932528471946716, + "objective/scores": 1.1, + "policy/approxkl_avg": 19.882667541503906, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7180185317993164, + "step": 1250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0010123252868652 + }, + { + "episode": 20032, + "epoch": 0.36006758457058635, + "loss/policy_avg": -0.03628786280751228, + "lr": 2.7601610429447854e-06, + "objective/entropy": 8.678840637207031, + "objective/kl": 13.700248718261719, + "objective/non_score_reward": -1.3700249195098877, + "objective/rlhf_reward": -7.480099678039551, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.891830444335938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6439998149871826, + "step": 1251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.004136085510254 + }, + { + "episode": 20048, + "epoch": 0.360355178487975, + "loss/policy_avg": 0.038509830832481384, + "lr": 2.759969325153374e-06, + "objective/entropy": -88.04098510742188, + "objective/kl": 11.22274112701416, + "objective/non_score_reward": -1.1222741603851318, + "objective/rlhf_reward": -2.089096477627754, + "objective/scores": 0.6, + "policy/approxkl_avg": 32.534996032714844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.532057523727417, + "step": 1252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985415935516357 + }, + { + "episode": 20064, + "epoch": 0.36064277240536363, + "loss/policy_avg": 0.3246498703956604, + "lr": 2.7597776073619634e-06, + "objective/entropy": -265.6895751953125, + "objective/kl": 17.194293975830078, + "objective/non_score_reward": -1.71942937374115, + "objective/rlhf_reward": -6.47771737575531, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.17576026916504, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6558340787887573, + "step": 1253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9985039234161377 + }, + { + "episode": 20080, + "epoch": 0.36093036632275227, + "loss/policy_avg": 0.34922367334365845, + "lr": 2.7595858895705523e-06, + "objective/entropy": 323.5899658203125, + "objective/kl": 16.424396514892578, + "objective/non_score_reward": -1.642439842224121, + "objective/rlhf_reward": -8.569759368896484, + "objective/scores": -0.5, + "policy/approxkl_avg": 119.47270965576172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8957875967025757, + "step": 1254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001938343048096 + }, + { + "episode": 20096, + "epoch": 0.3612179602401409, + "loss/policy_avg": 0.1072012335062027, + "lr": 2.7593941717791415e-06, + "objective/entropy": -119.50285339355469, + "objective/kl": 15.749418258666992, + "objective/non_score_reward": -1.574941873550415, + "objective/rlhf_reward": -5.899767941236496, + "objective/scores": 0.1, + "policy/approxkl_avg": 87.79750061035156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41685575246810913, + "step": 1255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994564056396484 + }, + { + "episode": 20112, + "epoch": 0.36150555415752955, + "loss/policy_avg": -0.09395378828048706, + "lr": 2.75920245398773e-06, + "objective/entropy": 104.20881652832031, + "objective/kl": 14.162569046020508, + "objective/non_score_reward": -1.4162569046020508, + "objective/rlhf_reward": -7.665027618408203, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.374798774719238, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7825880646705627, + "step": 1256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0023365020751953 + }, + { + "episode": 20128, + "epoch": 0.3617931480749182, + "loss/policy_avg": 0.028538435697555542, + "lr": 2.759010736196319e-06, + "objective/entropy": -46.99930953979492, + "objective/kl": 12.82414436340332, + "objective/non_score_reward": -1.282414436340332, + "objective/rlhf_reward": -7.129657745361328, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.753807067871094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7124722599983215, + "step": 1257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0020642280578613 + }, + { + "episode": 20144, + "epoch": 0.3620807419923069, + "loss/policy_avg": -0.021636590361595154, + "lr": 2.758819018404908e-06, + "objective/entropy": -221.2742919921875, + "objective/kl": 9.566751480102539, + "objective/non_score_reward": -0.956675112247467, + "objective/rlhf_reward": 0.573299536108971, + "objective/scores": 1.1, + "policy/approxkl_avg": 18.7915096282959, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5591281652450562, + "step": 1258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001438617706299 + }, + { + "episode": 20160, + "epoch": 0.3623683359096955, + "loss/policy_avg": -0.04480976611375809, + "lr": 2.7586273006134967e-06, + "objective/entropy": 240.30905151367188, + "objective/kl": 14.567556381225586, + "objective/non_score_reward": -1.4567558765411377, + "objective/rlhf_reward": -7.827023506164551, + "objective/scores": -0.5, + "policy/approxkl_avg": 54.074058532714844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6273489594459534, + "step": 1259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984817504882812 + }, + { + "episode": 20176, + "epoch": 0.36265592982708417, + "loss/policy_avg": 0.4374186396598816, + "lr": 2.758435582822086e-06, + "objective/entropy": 188.3015594482422, + "objective/kl": 17.31353759765625, + "objective/non_score_reward": -1.731353998184204, + "objective/rlhf_reward": -4.802709641233955, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 31.519737243652344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8211510181427002, + "step": 1260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011744499206543 + }, + { + "episode": 20192, + "epoch": 0.3629435237444728, + "loss/policy_avg": 0.40651851892471313, + "lr": 2.7582438650306748e-06, + "objective/entropy": -213.75115966796875, + "objective/kl": 13.607375144958496, + "objective/non_score_reward": -1.3607374429702759, + "objective/rlhf_reward": -5.042950010299682, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.90991973876953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5784170031547546, + "step": 1261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9970366954803467 + }, + { + "episode": 20208, + "epoch": 0.36323111766186145, + "loss/policy_avg": 0.08956024795770645, + "lr": 2.758052147239264e-06, + "objective/entropy": 154.9608612060547, + "objective/kl": 15.312253952026367, + "objective/non_score_reward": -1.5312254428863525, + "objective/rlhf_reward": -1.724901711940765, + "objective/scores": 1.1, + "policy/approxkl_avg": 106.27772521972656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4395660161972046, + "step": 1262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995371103286743 + }, + { + "episode": 20224, + "epoch": 0.3635187115792501, + "loss/policy_avg": 0.33477357029914856, + "lr": 2.757860429447853e-06, + "objective/entropy": -70.99191284179688, + "objective/kl": 12.952332496643066, + "objective/non_score_reward": -1.2952332496643066, + "objective/rlhf_reward": -3.447599486509959, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 37.827144622802734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7374582290649414, + "step": 1263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.995866298675537 + }, + { + "episode": 20240, + "epoch": 0.3638063054966387, + "loss/policy_avg": 0.6385763883590698, + "lr": 2.7576687116564416e-06, + "objective/entropy": -83.57589721679688, + "objective/kl": 14.134538650512695, + "objective/non_score_reward": -1.4134538173675537, + "objective/rlhf_reward": -5.253815388679504, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.28148651123047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5673624873161316, + "step": 1264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980382919311523 + }, + { + "episode": 20256, + "epoch": 0.36409389941402737, + "loss/policy_avg": 0.1510678082704544, + "lr": 2.757476993865031e-06, + "objective/entropy": -1.0954818725585938, + "objective/kl": 13.511443138122559, + "objective/non_score_reward": -1.3511443138122559, + "objective/rlhf_reward": -1.0045771360397335, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.97186279296875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.557723879814148, + "step": 1265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000786304473877 + }, + { + "episode": 20272, + "epoch": 0.36438149333141606, + "loss/policy_avg": -0.15208236873149872, + "lr": 2.7572852760736197e-06, + "objective/entropy": -118.6660385131836, + "objective/kl": 16.32196807861328, + "objective/non_score_reward": -1.6321969032287598, + "objective/rlhf_reward": -8.528787612915039, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.030231475830078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7827368974685669, + "step": 1266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0016098022460938 + }, + { + "episode": 20288, + "epoch": 0.3646690872488047, + "loss/policy_avg": 0.3386310338973999, + "lr": 2.7570935582822085e-06, + "objective/entropy": 249.75567626953125, + "objective/kl": 14.743573188781738, + "objective/non_score_reward": -1.4743572473526, + "objective/rlhf_reward": -7.8974289894104, + "objective/scores": -0.5, + "policy/approxkl_avg": 85.64590454101562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6248792409896851, + "step": 1267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980359077453613 + }, + { + "episode": 20304, + "epoch": 0.36495668116619334, + "loss/policy_avg": 0.6923346519470215, + "lr": 2.7569018404907977e-06, + "objective/entropy": 117.10751342773438, + "objective/kl": 14.273673057556152, + "objective/non_score_reward": -1.4273674488067627, + "objective/rlhf_reward": -3.884640704068254, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 126.95195770263672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.622350811958313, + "step": 1268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966614246368408 + }, + { + "episode": 20320, + "epoch": 0.365244275083582, + "loss/policy_avg": 0.08828192949295044, + "lr": 2.7567101226993865e-06, + "objective/entropy": 95.71147155761719, + "objective/kl": 22.682233810424805, + "objective/non_score_reward": -2.268223285675049, + "objective/rlhf_reward": -7.468773577276783, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 78.45034790039062, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7277878522872925, + "step": 1269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989908933639526 + }, + { + "episode": 20336, + "epoch": 0.3655318690009706, + "loss/policy_avg": 0.41136306524276733, + "lr": 2.7565184049079757e-06, + "objective/entropy": -162.8468780517578, + "objective/kl": 17.89332389831543, + "objective/non_score_reward": -1.7893322706222534, + "objective/rlhf_reward": -2.7573293209075924, + "objective/scores": 1.1, + "policy/approxkl_avg": 37.13008117675781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6133675575256348, + "step": 1270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000051975250244 + }, + { + "episode": 20352, + "epoch": 0.36581946291835926, + "loss/policy_avg": 0.019866986200213432, + "lr": 2.7563266871165646e-06, + "objective/entropy": 217.34902954101562, + "objective/kl": 12.45542049407959, + "objective/non_score_reward": -1.245542049407959, + "objective/rlhf_reward": -4.58216837644577, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.521160125732422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.826188862323761, + "step": 1271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006699562072754 + }, + { + "episode": 20368, + "epoch": 0.3661070568357479, + "loss/policy_avg": 0.07603908330202103, + "lr": 2.7561349693251534e-06, + "objective/entropy": 21.53125762939453, + "objective/kl": 12.73741340637207, + "objective/non_score_reward": -1.2737412452697754, + "objective/rlhf_reward": -0.6949649810791012, + "objective/scores": 1.1, + "policy/approxkl_avg": 49.010589599609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5798752307891846, + "step": 1272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9964756965637207 + }, + { + "episode": 20384, + "epoch": 0.3663946507531366, + "loss/policy_avg": -0.3879421353340149, + "lr": 2.7559432515337426e-06, + "objective/entropy": 135.53102111816406, + "objective/kl": 15.746997833251953, + "objective/non_score_reward": -1.5746999979019165, + "objective/rlhf_reward": -8.298799514770508, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.827058792114258, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6375302076339722, + "step": 1273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0074105262756348 + }, + { + "episode": 20400, + "epoch": 0.36668224467052524, + "loss/policy_avg": 0.15085294842720032, + "lr": 2.7557515337423314e-06, + "objective/entropy": 300.15081787109375, + "objective/kl": 21.906736373901367, + "objective/non_score_reward": -2.190673828125, + "objective/rlhf_reward": -10.7626953125, + "objective/scores": -0.5, + "policy/approxkl_avg": 292.7366943359375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7200959920883179, + "step": 1274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972238540649414 + }, + { + "episode": 20416, + "epoch": 0.3669698385879139, + "loss/policy_avg": 0.47925299406051636, + "lr": 2.7555598159509206e-06, + "objective/entropy": 147.05970764160156, + "objective/kl": 12.889491081237793, + "objective/non_score_reward": -1.2889491319656372, + "objective/rlhf_reward": -2.2320775135767192, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 8.79387378692627, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7686591148376465, + "step": 1275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991161823272705 + }, + { + "episode": 20432, + "epoch": 0.3672574325053025, + "loss/policy_avg": 0.5244907140731812, + "lr": 2.7553680981595095e-06, + "objective/entropy": -89.96923828125, + "objective/kl": 9.46074390411377, + "objective/non_score_reward": -0.946074366569519, + "objective/rlhf_reward": -1.3842974662780758, + "objective/scores": 0.6, + "policy/approxkl_avg": 65.30824279785156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.698806881904602, + "step": 1276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000295400619507 + }, + { + "episode": 20448, + "epoch": 0.36754502642269116, + "loss/policy_avg": 0.6136975288391113, + "lr": 2.7551763803680983e-06, + "objective/entropy": 7.9844207763671875, + "objective/kl": 11.841961860656738, + "objective/non_score_reward": -1.1841962337493896, + "objective/rlhf_reward": -0.3367847561836239, + "objective/scores": 1.1, + "policy/approxkl_avg": 87.70883178710938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.643621027469635, + "step": 1277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997437000274658 + }, + { + "episode": 20464, + "epoch": 0.3678326203400798, + "loss/policy_avg": -0.00877484492957592, + "lr": 2.7549846625766875e-06, + "objective/entropy": -142.05413818359375, + "objective/kl": 13.480876922607422, + "objective/non_score_reward": -1.3480876684188843, + "objective/rlhf_reward": -2.992350524663925, + "objective/scores": 0.6, + "policy/approxkl_avg": 101.64769744873047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.591066300868988, + "step": 1278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975664615631104 + }, + { + "episode": 20480, + "epoch": 0.36812021425746844, + "loss/policy_avg": 0.04989251866936684, + "lr": 2.754792944785276e-06, + "objective/entropy": 3.4066810607910156, + "objective/kl": 14.480069160461426, + "objective/non_score_reward": -1.4480068683624268, + "objective/rlhf_reward": -7.792027473449707, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.06435775756836, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7045605182647705, + "step": 1279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9972350597381592 + }, + { + "episode": 20496, + "epoch": 0.3684078081748571, + "loss/policy_avg": 1.8392304182052612, + "lr": 2.754601226993865e-06, + "objective/entropy": 46.341854095458984, + "objective/kl": 17.002796173095703, + "objective/non_score_reward": -1.700279712677002, + "objective/rlhf_reward": -2.401118969917297, + "objective/scores": 1.1, + "policy/approxkl_avg": 13.771295547485352, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7492488026618958, + "step": 1280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002211570739746 + }, + { + "episode": 20512, + "epoch": 0.36869540209224577, + "loss/policy_avg": 0.062071219086647034, + "lr": 2.754409509202454e-06, + "objective/entropy": -166.6836700439453, + "objective/kl": 15.558152198791504, + "objective/non_score_reward": -1.5558152198791504, + "objective/rlhf_reward": -4.398432280096124, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 12.791590690612793, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5931823253631592, + "step": 1281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000124454498291 + }, + { + "episode": 20528, + "epoch": 0.3689829960096344, + "loss/policy_avg": 0.6587561368942261, + "lr": 2.7542177914110427e-06, + "objective/entropy": 28.02899932861328, + "objective/kl": 11.468122482299805, + "objective/non_score_reward": -1.1468123197555542, + "objective/rlhf_reward": -2.7624205901947727, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 50.253570556640625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7729257345199585, + "step": 1282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9982600212097168 + }, + { + "episode": 20544, + "epoch": 0.36927058992702305, + "loss/policy_avg": 0.16267871856689453, + "lr": 2.754026073619632e-06, + "objective/entropy": 167.5796661376953, + "objective/kl": 12.056804656982422, + "objective/non_score_reward": -1.2056803703308105, + "objective/rlhf_reward": -4.4227216899394985, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.177530288696289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.665571391582489, + "step": 1283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999282360076904 + }, + { + "episode": 20560, + "epoch": 0.3695581838444117, + "loss/policy_avg": 0.10843977332115173, + "lr": 2.7538343558282208e-06, + "objective/entropy": -65.32721710205078, + "objective/kl": 9.67951774597168, + "objective/non_score_reward": -0.9679518938064575, + "objective/rlhf_reward": 0.5281925439834598, + "objective/scores": 1.1, + "policy/approxkl_avg": 28.627899169921875, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5553978085517883, + "step": 1284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0003318786621094 + }, + { + "episode": 20576, + "epoch": 0.36984577776180033, + "loss/policy_avg": 0.09218505769968033, + "lr": 2.75364263803681e-06, + "objective/entropy": -287.59014892578125, + "objective/kl": 10.535503387451172, + "objective/non_score_reward": -1.0535502433776855, + "objective/rlhf_reward": 0.18579889237880742, + "objective/scores": 1.1, + "policy/approxkl_avg": 33.40215301513672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6041494607925415, + "step": 1285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.9981060028076172 + }, + { + "episode": 20592, + "epoch": 0.37013337167918897, + "loss/policy_avg": 0.4806681275367737, + "lr": 2.753450920245399e-06, + "objective/entropy": 114.24024200439453, + "objective/kl": 12.809671401977539, + "objective/non_score_reward": -1.2809672355651855, + "objective/rlhf_reward": -3.001162292734657, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 103.79825592041016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7868355512619019, + "step": 1286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987444877624512 + }, + { + "episode": 20608, + "epoch": 0.3704209655965776, + "loss/policy_avg": 0.4517222046852112, + "lr": 2.7532592024539876e-06, + "objective/entropy": -82.73650360107422, + "objective/kl": 16.96005630493164, + "objective/non_score_reward": -1.6960057020187378, + "objective/rlhf_reward": -8.78402328491211, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.851829528808594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7056453824043274, + "step": 1287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979089498519897 + }, + { + "episode": 20624, + "epoch": 0.3707085595139663, + "loss/policy_avg": 0.3327828645706177, + "lr": 2.753067484662577e-06, + "objective/entropy": -133.07638549804688, + "objective/kl": 13.776689529418945, + "objective/non_score_reward": -1.377668857574463, + "objective/rlhf_reward": -5.1106756091117855, + "objective/scores": 0.1, + "policy/approxkl_avg": 28.329500198364258, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6216986179351807, + "step": 1288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000491142272949 + }, + { + "episode": 20640, + "epoch": 0.37099615343135495, + "loss/policy_avg": 0.5435598492622375, + "lr": 2.7528757668711657e-06, + "objective/entropy": -170.30508422851562, + "objective/kl": 17.714038848876953, + "objective/non_score_reward": -1.7714040279388428, + "objective/rlhf_reward": -6.685615873336792, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.18598747253418, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7448534965515137, + "step": 1289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992847442626953 + }, + { + "episode": 20656, + "epoch": 0.3712837473487436, + "loss/policy_avg": 0.24354924261569977, + "lr": 2.7526840490797545e-06, + "objective/entropy": 23.606029510498047, + "objective/kl": 26.219905853271484, + "objective/non_score_reward": -2.62199068069458, + "objective/rlhf_reward": -6.087962484359741, + "objective/scores": 1.1, + "policy/approxkl_avg": 57.348243713378906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5495198965072632, + "step": 1290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979891777038574 + }, + { + "episode": 20672, + "epoch": 0.3715713412661322, + "loss/policy_avg": 1.0543159246444702, + "lr": 2.7524923312883437e-06, + "objective/entropy": 263.5877685546875, + "objective/kl": 7.638702392578125, + "objective/non_score_reward": -0.7638702392578125, + "objective/rlhf_reward": 1.3445191025733951, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.8037867546081543, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6338490843772888, + "step": 1291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0179636478424072 + }, + { + "episode": 20688, + "epoch": 0.37185893518352087, + "loss/policy_avg": 0.7852039933204651, + "lr": 2.7523006134969325e-06, + "objective/entropy": 304.3499450683594, + "objective/kl": 19.973011016845703, + "objective/non_score_reward": -1.9973011016845703, + "objective/rlhf_reward": -7.589204466342926, + "objective/scores": 0.1, + "policy/approxkl_avg": 71.57891845703125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9033545255661011, + "step": 1292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971482753753662 + }, + { + "episode": 20704, + "epoch": 0.3721465291009095, + "loss/policy_avg": 0.7660449743270874, + "lr": 2.7521088957055218e-06, + "objective/entropy": 363.7418212890625, + "objective/kl": 17.045547485351562, + "objective/non_score_reward": -1.7045549154281616, + "objective/rlhf_reward": -8.818220138549805, + "objective/scores": -0.5, + "policy/approxkl_avg": 165.32534790039062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.941403329372406, + "step": 1293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998640537261963 + }, + { + "episode": 20720, + "epoch": 0.37243412301829815, + "loss/policy_avg": 0.18262270092964172, + "lr": 2.7519171779141106e-06, + "objective/entropy": 340.91082763671875, + "objective/kl": 15.803611755371094, + "objective/non_score_reward": -1.580361247062683, + "objective/rlhf_reward": -4.765185623374537, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 74.37960815429688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8930048942565918, + "step": 1294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99721360206604 + }, + { + "episode": 20736, + "epoch": 0.3727217169356868, + "loss/policy_avg": 0.5075985193252563, + "lr": 2.7517254601226994e-06, + "objective/entropy": 66.2529296875, + "objective/kl": 10.169906616210938, + "objective/non_score_reward": -1.0169909000396729, + "objective/rlhf_reward": -3.6679632425308224, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.16536331176758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5018965005874634, + "step": 1295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997565746307373 + }, + { + "episode": 20752, + "epoch": 0.3730093108530755, + "loss/policy_avg": 0.4298272132873535, + "lr": 2.7515337423312886e-06, + "objective/entropy": 20.0439453125, + "objective/kl": 13.036293029785156, + "objective/non_score_reward": -1.3036293983459473, + "objective/rlhf_reward": -7.214517116546631, + "objective/scores": -0.5, + "policy/approxkl_avg": 15.878606796264648, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6570239067077637, + "step": 1296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991276264190674 + }, + { + "episode": 20768, + "epoch": 0.3732969047704641, + "loss/policy_avg": 0.5323415994644165, + "lr": 2.7513420245398774e-06, + "objective/entropy": 100.38456726074219, + "objective/kl": 8.541244506835938, + "objective/non_score_reward": -0.8541244268417358, + "objective/rlhf_reward": -3.016497766971588, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.5799808502197266, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.544098973274231, + "step": 1297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0030770301818848 + }, + { + "episode": 20784, + "epoch": 0.37358449868785276, + "loss/policy_avg": 0.0821521133184433, + "lr": 2.7511503067484666e-06, + "objective/entropy": -208.57977294921875, + "objective/kl": 12.208963394165039, + "objective/non_score_reward": -1.2208964824676514, + "objective/rlhf_reward": -6.8835859298706055, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.187795639038086, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6201414465904236, + "step": 1298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.003359317779541 + }, + { + "episode": 20800, + "epoch": 0.3738720926052414, + "loss/policy_avg": 0.21889813244342804, + "lr": 2.7509585889570555e-06, + "objective/entropy": -46.03602600097656, + "objective/kl": 12.725289344787598, + "objective/non_score_reward": -1.272528886795044, + "objective/rlhf_reward": -4.690115666389465, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.959117889404297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.619624674320221, + "step": 1299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985125064849854 + }, + { + "episode": 20816, + "epoch": 0.37415968652263004, + "loss/policy_avg": -0.2133423089981079, + "lr": 2.7507668711656443e-06, + "objective/entropy": 17.793853759765625, + "objective/kl": 12.961200714111328, + "objective/non_score_reward": -1.2961199283599854, + "objective/rlhf_reward": -0.784479862451553, + "objective/scores": 1.1, + "policy/approxkl_avg": 15.770217895507812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5697649717330933, + "step": 1300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999387502670288 + }, + { + "episode": 20832, + "epoch": 0.3744472804400187, + "loss/policy_avg": 0.21750691533088684, + "lr": 2.750575153374233e-06, + "objective/entropy": -119.40364074707031, + "objective/kl": 9.480939865112305, + "objective/non_score_reward": -0.9480940103530884, + "objective/rlhf_reward": -3.392376041412353, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.330104827880859, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.612727165222168, + "step": 1301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.009127616882324 + }, + { + "episode": 20848, + "epoch": 0.3747348743574073, + "loss/policy_avg": 0.4480114281177521, + "lr": 2.750383435582822e-06, + "objective/entropy": 16.765975952148438, + "objective/kl": 16.303150177001953, + "objective/non_score_reward": -1.6303151845932007, + "objective/rlhf_reward": -4.398554409221683, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 132.9036865234375, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6376628279685974, + "step": 1302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9977257251739502 + }, + { + "episode": 20864, + "epoch": 0.37502246827479596, + "loss/policy_avg": -0.39135733246803284, + "lr": 2.750191717791411e-06, + "objective/entropy": -138.0423126220703, + "objective/kl": 9.426933288574219, + "objective/non_score_reward": -0.9426934719085693, + "objective/rlhf_reward": -2.037440554300944, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 17.444637298583984, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5734092593193054, + "step": 1303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.2076222896575928 + }, + { + "episode": 20880, + "epoch": 0.37531006219218466, + "loss/policy_avg": 0.4854779541492462, + "lr": 2.75e-06, + "objective/entropy": 92.62399291992188, + "objective/kl": 16.287330627441406, + "objective/non_score_reward": -1.6287332773208618, + "objective/rlhf_reward": -3.591213975788328, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 11.808944702148438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6390659213066101, + "step": 1304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000646114349365 + }, + { + "episode": 20896, + "epoch": 0.3755976561095733, + "loss/policy_avg": -0.24735336005687714, + "lr": 2.7498082822085887e-06, + "objective/entropy": -53.366703033447266, + "objective/kl": 10.406715393066406, + "objective/non_score_reward": -1.040671467781067, + "objective/rlhf_reward": -6.162686347961426, + "objective/scores": -0.5, + "policy/approxkl_avg": 34.323829650878906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5506607294082642, + "step": 1305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0012283325195312 + }, + { + "episode": 20912, + "epoch": 0.37588525002696194, + "loss/policy_avg": 0.5731884241104126, + "lr": 2.749616564417178e-06, + "objective/entropy": 111.93737030029297, + "objective/kl": 17.37570571899414, + "objective/non_score_reward": -1.7375702857971191, + "objective/rlhf_reward": -8.950281143188477, + "objective/scores": -0.5, + "policy/approxkl_avg": 77.21607208251953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7403782606124878, + "step": 1306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988470077514648 + }, + { + "episode": 20928, + "epoch": 0.3761728439443506, + "loss/policy_avg": 0.2895098328590393, + "lr": 2.7494248466257668e-06, + "objective/entropy": 31.602371215820312, + "objective/kl": 13.965235710144043, + "objective/non_score_reward": -1.3965235948562622, + "objective/rlhf_reward": -3.6386830312775924, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 33.449668884277344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6721315383911133, + "step": 1307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998502492904663 + }, + { + "episode": 20944, + "epoch": 0.3764604378617392, + "loss/policy_avg": -0.17103518545627594, + "lr": 2.749233128834356e-06, + "objective/entropy": 132.74185180664062, + "objective/kl": 20.359312057495117, + "objective/non_score_reward": -2.03593111038208, + "objective/rlhf_reward": -3.7437247991561886, + "objective/scores": 1.1, + "policy/approxkl_avg": 118.31222534179688, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.718961775302887, + "step": 1308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002476692199707 + }, + { + "episode": 20960, + "epoch": 0.37674803177912786, + "loss/policy_avg": 0.14908567070960999, + "lr": 2.749041411042945e-06, + "objective/entropy": 244.53402709960938, + "objective/kl": 15.694480895996094, + "objective/non_score_reward": -1.5694482326507568, + "objective/rlhf_reward": -8.277792930603027, + "objective/scores": -0.5, + "policy/approxkl_avg": 63.194114685058594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6035882234573364, + "step": 1309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997267723083496 + }, + { + "episode": 20976, + "epoch": 0.3770356256965165, + "loss/policy_avg": 0.05589284002780914, + "lr": 2.7488496932515336e-06, + "objective/entropy": 42.838871002197266, + "objective/kl": 14.986385345458984, + "objective/non_score_reward": -1.49863862991333, + "objective/rlhf_reward": -5.5945545271039006, + "objective/scores": 0.1, + "policy/approxkl_avg": 78.51327514648438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.670206606388092, + "step": 1310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988725185394287 + }, + { + "episode": 20992, + "epoch": 0.3773232196139052, + "loss/policy_avg": 0.5007076263427734, + "lr": 2.748657975460123e-06, + "objective/entropy": 176.76974487304688, + "objective/kl": 15.967851638793945, + "objective/non_score_reward": -1.5967851877212524, + "objective/rlhf_reward": -8.387140274047852, + "objective/scores": -0.5, + "policy/approxkl_avg": 53.00636672973633, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6999363899230957, + "step": 1311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997910737991333 + }, + { + "episode": 21008, + "epoch": 0.37761081353129383, + "loss/policy_avg": 1.6480071544647217, + "lr": 2.7484662576687117e-06, + "objective/entropy": 79.01959228515625, + "objective/kl": 17.435348510742188, + "objective/non_score_reward": -1.743534803390503, + "objective/rlhf_reward": -2.5741390645503994, + "objective/scores": 1.1, + "policy/approxkl_avg": 99.76893615722656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7191579341888428, + "step": 1312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991645812988281 + }, + { + "episode": 21024, + "epoch": 0.37789840744868247, + "loss/policy_avg": 0.16946262121200562, + "lr": 2.748274539877301e-06, + "objective/entropy": 133.11578369140625, + "objective/kl": 9.329315185546875, + "objective/non_score_reward": -0.9329314827919006, + "objective/rlhf_reward": 0.6682741880416874, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.421842575073242, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5224270224571228, + "step": 1313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0036096572875977 + }, + { + "episode": 21040, + "epoch": 0.3781860013660711, + "loss/policy_avg": 0.32205283641815186, + "lr": 2.7480828220858897e-06, + "objective/entropy": -96.55888366699219, + "objective/kl": 11.596528053283691, + "objective/non_score_reward": -1.1596527099609375, + "objective/rlhf_reward": -0.23861107826232875, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.196664810180664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5269286632537842, + "step": 1314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993153810501099 + }, + { + "episode": 21056, + "epoch": 0.37847359528345975, + "loss/policy_avg": 0.060142651200294495, + "lr": 2.7478911042944785e-06, + "objective/entropy": 106.45033264160156, + "objective/kl": 9.987686157226562, + "objective/non_score_reward": -0.99876868724823, + "objective/rlhf_reward": -5.995074272155762, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.6358002424240112, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6303056478500366, + "step": 1315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00164794921875 + }, + { + "episode": 21072, + "epoch": 0.3787611892008484, + "loss/policy_avg": 0.5722357034683228, + "lr": 2.7476993865030678e-06, + "objective/entropy": 36.441009521484375, + "objective/kl": 14.859726905822754, + "objective/non_score_reward": -1.4859726428985596, + "objective/rlhf_reward": -1.5438909776508805, + "objective/scores": 1.1, + "policy/approxkl_avg": 233.2808837890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.736956000328064, + "step": 1316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973156452178955 + }, + { + "episode": 21088, + "epoch": 0.37904878311823703, + "loss/policy_avg": 0.14421963691711426, + "lr": 2.7475076687116566e-06, + "objective/entropy": -57.41680908203125, + "objective/kl": 15.156774520874023, + "objective/non_score_reward": -1.5156774520874023, + "objective/rlhf_reward": -8.06270980834961, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.829402923583984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.795988917350769, + "step": 1317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0047948360443115 + }, + { + "episode": 21104, + "epoch": 0.37933637703562567, + "loss/policy_avg": 0.6621519327163696, + "lr": 2.7473159509202454e-06, + "objective/entropy": 89.67889404296875, + "objective/kl": 16.89303207397461, + "objective/non_score_reward": -1.689302921295166, + "objective/rlhf_reward": -4.634505877570186, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 178.07943725585938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8290786147117615, + "step": 1318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0020265579223633 + }, + { + "episode": 21120, + "epoch": 0.37962397095301437, + "loss/policy_avg": 0.5616532564163208, + "lr": 2.7471242331288346e-06, + "objective/entropy": 228.52713012695312, + "objective/kl": 11.308792114257812, + "objective/non_score_reward": -1.1308794021606445, + "objective/rlhf_reward": -6.523517608642578, + "objective/scores": -0.5, + "policy/approxkl_avg": 72.7155532836914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7285025119781494, + "step": 1319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004825592041016 + }, + { + "episode": 21136, + "epoch": 0.379911564870403, + "loss/policy_avg": 0.4463692903518677, + "lr": 2.7469325153374234e-06, + "objective/entropy": 33.118324279785156, + "objective/kl": 16.940200805664062, + "objective/non_score_reward": -1.6940197944641113, + "objective/rlhf_reward": -2.3760792374610897, + "objective/scores": 1.1, + "policy/approxkl_avg": 26.495033264160156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.708628237247467, + "step": 1320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003607273101807 + }, + { + "episode": 21152, + "epoch": 0.38019915878779165, + "loss/policy_avg": 0.033870283514261246, + "lr": 2.7467407975460127e-06, + "objective/entropy": 138.80300903320312, + "objective/kl": 18.500614166259766, + "objective/non_score_reward": -1.8500614166259766, + "objective/rlhf_reward": -9.400245666503906, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.69375991821289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6441357731819153, + "step": 1321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995644092559814 + }, + { + "episode": 21168, + "epoch": 0.3804867527051803, + "loss/policy_avg": 0.3302859663963318, + "lr": 2.7465490797546015e-06, + "objective/entropy": 294.7465515136719, + "objective/kl": 21.378875732421875, + "objective/non_score_reward": -2.137887716293335, + "objective/rlhf_reward": -8.15155074596405, + "objective/scores": 0.1, + "policy/approxkl_avg": 118.91703796386719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9274790287017822, + "step": 1322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9957250356674194 + }, + { + "episode": 21184, + "epoch": 0.3807743466225689, + "loss/policy_avg": 0.0874224305152893, + "lr": 2.7463573619631903e-06, + "objective/entropy": -143.14297485351562, + "objective/kl": 10.797538757324219, + "objective/non_score_reward": -1.0797538757324219, + "objective/rlhf_reward": -6.3190155029296875, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.874238014221191, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7316051125526428, + "step": 1323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993560314178467 + }, + { + "episode": 21200, + "epoch": 0.38106194053995757, + "loss/policy_avg": 0.5921223163604736, + "lr": 2.746165644171779e-06, + "objective/entropy": 154.961181640625, + "objective/kl": 20.293582916259766, + "objective/non_score_reward": -2.029358386993408, + "objective/rlhf_reward": -10.117433547973633, + "objective/scores": -0.5, + "policy/approxkl_avg": 93.22239685058594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6468446254730225, + "step": 1324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983704090118408 + }, + { + "episode": 21216, + "epoch": 0.3813495344573462, + "loss/policy_avg": 0.6764448881149292, + "lr": 2.745973926380368e-06, + "objective/entropy": 45.973533630371094, + "objective/kl": 15.151253700256348, + "objective/non_score_reward": -1.5151253938674927, + "objective/rlhf_reward": -8.060501098632812, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.96009063720703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7670845985412598, + "step": 1325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982659816741943 + }, + { + "episode": 21232, + "epoch": 0.3816371283747349, + "loss/policy_avg": 0.3867417871952057, + "lr": 2.745782208588957e-06, + "objective/entropy": -59.67474365234375, + "objective/kl": 18.129138946533203, + "objective/non_score_reward": -1.8129138946533203, + "objective/rlhf_reward": -2.8516556978225704, + "objective/scores": 1.1, + "policy/approxkl_avg": 18.081439971923828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5321407318115234, + "step": 1326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0017099380493164 + }, + { + "episode": 21248, + "epoch": 0.38192472229212354, + "loss/policy_avg": -0.04611814394593239, + "lr": 2.745590490797546e-06, + "objective/entropy": 61.81705856323242, + "objective/kl": 19.817153930664062, + "objective/non_score_reward": -1.981715440750122, + "objective/rlhf_reward": -7.526861643791198, + "objective/scores": 0.1, + "policy/approxkl_avg": 112.93687438964844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7623246908187866, + "step": 1327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971072673797607 + }, + { + "episode": 21264, + "epoch": 0.3822123162095122, + "loss/policy_avg": 0.07851407676935196, + "lr": 2.7453987730061347e-06, + "objective/entropy": -244.60308837890625, + "objective/kl": 8.617375373840332, + "objective/non_score_reward": -0.8617374897003174, + "objective/rlhf_reward": -3.046949928998947, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.10788345336914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6370751857757568, + "step": 1328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0000083446502686 + }, + { + "episode": 21280, + "epoch": 0.3824999101269008, + "loss/policy_avg": 0.13727867603302002, + "lr": 2.745207055214724e-06, + "objective/entropy": 233.06227111816406, + "objective/kl": 12.714912414550781, + "objective/non_score_reward": -1.271491289138794, + "objective/rlhf_reward": -4.685965275764465, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.4720458984375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5807084441184998, + "step": 1329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980757236480713 + }, + { + "episode": 21296, + "epoch": 0.38278750404428946, + "loss/policy_avg": 0.28195735812187195, + "lr": 2.7450153374233128e-06, + "objective/entropy": 45.676937103271484, + "objective/kl": 8.602317810058594, + "objective/non_score_reward": -0.8602317571640015, + "objective/rlhf_reward": -3.0409270286560055, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.06165885925293, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5428619384765625, + "step": 1330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992284774780273 + }, + { + "episode": 21312, + "epoch": 0.3830750979616781, + "loss/policy_avg": 0.47195160388946533, + "lr": 2.744823619631902e-06, + "objective/entropy": -45.77192687988281, + "objective/kl": 14.907909393310547, + "objective/non_score_reward": -1.490790843963623, + "objective/rlhf_reward": -7.963163375854492, + "objective/scores": -0.5, + "policy/approxkl_avg": 100.64746856689453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6922006011009216, + "step": 1331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971425533294678 + }, + { + "episode": 21328, + "epoch": 0.38336269187906674, + "loss/policy_avg": 0.019790709018707275, + "lr": 2.744631901840491e-06, + "objective/entropy": 102.41281127929688, + "objective/kl": 8.887635231018066, + "objective/non_score_reward": -0.8887635469436646, + "objective/rlhf_reward": -3.1550543963909146, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.5302138328552246, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8031924962997437, + "step": 1332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0033862590789795 + }, + { + "episode": 21344, + "epoch": 0.3836502857964554, + "loss/policy_avg": 0.20752736926078796, + "lr": 2.7444401840490796e-06, + "objective/entropy": 373.9801940917969, + "objective/kl": 22.242820739746094, + "objective/non_score_reward": -2.2242817878723145, + "objective/rlhf_reward": -10.897127151489258, + "objective/scores": -0.5, + "policy/approxkl_avg": 39.84983825683594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9056296348571777, + "step": 1333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998758316040039 + }, + { + "episode": 21360, + "epoch": 0.3839378797138441, + "loss/policy_avg": 0.4480108618736267, + "lr": 2.744248466257669e-06, + "objective/entropy": -94.64065551757812, + "objective/kl": 17.10127067565918, + "objective/non_score_reward": -1.7101271152496338, + "objective/rlhf_reward": -5.1786488346463315, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 49.86717224121094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7233811616897583, + "step": 1334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996772289276123 + }, + { + "episode": 21376, + "epoch": 0.3842254736312327, + "loss/policy_avg": 0.5475999712944031, + "lr": 2.7440567484662577e-06, + "objective/entropy": 106.49105834960938, + "objective/kl": 11.232261657714844, + "objective/non_score_reward": -1.1232261657714844, + "objective/rlhf_reward": -6.4929046630859375, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.136554718017578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7535625696182251, + "step": 1335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989811182022095 + }, + { + "episode": 21392, + "epoch": 0.38451306754862136, + "loss/policy_avg": 0.2780382037162781, + "lr": 2.743865030674847e-06, + "objective/entropy": 141.95111083984375, + "objective/kl": 15.009653091430664, + "objective/non_score_reward": -1.5009653568267822, + "objective/rlhf_reward": -1.6038616657257077, + "objective/scores": 1.1, + "policy/approxkl_avg": 40.69712829589844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46323397755622864, + "step": 1336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997363805770874 + }, + { + "episode": 21408, + "epoch": 0.38480066146601, + "loss/policy_avg": 0.5840069055557251, + "lr": 2.7436733128834357e-06, + "objective/entropy": 66.1385726928711, + "objective/kl": 12.383220672607422, + "objective/non_score_reward": -1.2383220195770264, + "objective/rlhf_reward": -4.55328813791275, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.937700271606445, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.67143714427948, + "step": 1337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998342990875244 + }, + { + "episode": 21424, + "epoch": 0.38508825538339864, + "loss/policy_avg": 0.47393912076950073, + "lr": 2.7434815950920245e-06, + "objective/entropy": 149.76553344726562, + "objective/kl": 22.620647430419922, + "objective/non_score_reward": -2.2620649337768555, + "objective/rlhf_reward": -8.648259377479553, + "objective/scores": 0.1, + "policy/approxkl_avg": 125.28185272216797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47826266288757324, + "step": 1338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998018741607666 + }, + { + "episode": 21440, + "epoch": 0.3853758493007873, + "loss/policy_avg": 0.561253011226654, + "lr": 2.7432898773006138e-06, + "objective/entropy": 165.69529724121094, + "objective/kl": 16.98709487915039, + "objective/non_score_reward": -1.6987093687057495, + "objective/rlhf_reward": -8.794837951660156, + "objective/scores": -0.5, + "policy/approxkl_avg": 104.10726928710938, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7110261917114258, + "step": 1339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979422092437744 + }, + { + "episode": 21456, + "epoch": 0.3856634432181759, + "loss/policy_avg": 0.08664319664239883, + "lr": 2.7430981595092026e-06, + "objective/entropy": 135.85597229003906, + "objective/kl": 13.391534805297852, + "objective/non_score_reward": -1.3391534090042114, + "objective/rlhf_reward": -4.956613807380199, + "objective/scores": 0.1, + "policy/approxkl_avg": 53.051788330078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5849297046661377, + "step": 1340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986178874969482 + }, + { + "episode": 21472, + "epoch": 0.38595103713556456, + "loss/policy_avg": 0.07530111074447632, + "lr": 2.7429064417177914e-06, + "objective/entropy": 94.53955078125, + "objective/kl": 9.170591354370117, + "objective/non_score_reward": -0.9170591831207275, + "objective/rlhf_reward": 0.7317632079124454, + "objective/scores": 1.1, + "policy/approxkl_avg": 5.455111980438232, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5276771783828735, + "step": 1341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987722635269165 + }, + { + "episode": 21488, + "epoch": 0.38623863105295325, + "loss/policy_avg": -0.18977919220924377, + "lr": 2.7427147239263806e-06, + "objective/entropy": 84.14942932128906, + "objective/kl": 13.785886764526367, + "objective/non_score_reward": -1.3785889148712158, + "objective/rlhf_reward": -1.1143553912639614, + "objective/scores": 1.1, + "policy/approxkl_avg": 18.839641571044922, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7620354294776917, + "step": 1342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.005643129348755 + }, + { + "episode": 21504, + "epoch": 0.3865262249703419, + "loss/policy_avg": 0.051979582756757736, + "lr": 2.7425230061349694e-06, + "objective/entropy": 144.39056396484375, + "objective/kl": 13.885366439819336, + "objective/non_score_reward": -1.3885365724563599, + "objective/rlhf_reward": -2.6304271861326427, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 29.781160354614258, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3581441342830658, + "step": 1343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010387897491455 + }, + { + "episode": 21520, + "epoch": 0.38681381888773053, + "loss/policy_avg": 0.032890960574150085, + "lr": 2.7423312883435587e-06, + "objective/entropy": 148.615478515625, + "objective/kl": 7.770719051361084, + "objective/non_score_reward": -0.7770719528198242, + "objective/rlhf_reward": -5.108287811279297, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.95543098449707, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6966961026191711, + "step": 1344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0212697982788086 + }, + { + "episode": 21536, + "epoch": 0.38710141280511917, + "loss/policy_avg": 0.13797160983085632, + "lr": 2.742139570552147e-06, + "objective/entropy": 287.2220764160156, + "objective/kl": 19.492271423339844, + "objective/non_score_reward": -1.9492273330688477, + "objective/rlhf_reward": -3.396909093856811, + "objective/scores": 1.1, + "policy/approxkl_avg": 48.7330207824707, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.713855504989624, + "step": 1345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992468357086182 + }, + { + "episode": 21552, + "epoch": 0.3873890067225078, + "loss/policy_avg": 0.26439833641052246, + "lr": 2.7419478527607363e-06, + "objective/entropy": 21.02935791015625, + "objective/kl": 14.434762954711914, + "objective/non_score_reward": -1.4434764385223389, + "objective/rlhf_reward": -7.7739057540893555, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.537498474121094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7992854118347168, + "step": 1346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975281953811646 + }, + { + "episode": 21568, + "epoch": 0.38767660063989645, + "loss/policy_avg": 0.5610612034797668, + "lr": 2.741756134969325e-06, + "objective/entropy": 51.866172790527344, + "objective/kl": 14.561103820800781, + "objective/non_score_reward": -1.4561104774475098, + "objective/rlhf_reward": -5.42444167137146, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.282157897949219, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.777137279510498, + "step": 1347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995806217193604 + }, + { + "episode": 21584, + "epoch": 0.3879641945572851, + "loss/policy_avg": 0.2028767466545105, + "lr": 2.741564417177914e-06, + "objective/entropy": 42.17268371582031, + "objective/kl": 16.39310073852539, + "objective/non_score_reward": -1.6393101215362549, + "objective/rlhf_reward": -4.609829376416142, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 51.239471435546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7365516424179077, + "step": 1348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000267505645752 + }, + { + "episode": 21600, + "epoch": 0.3882517884746738, + "loss/policy_avg": 0.13323874771595, + "lr": 2.741372699386503e-06, + "objective/entropy": 181.5269012451172, + "objective/kl": 8.048508644104004, + "objective/non_score_reward": -0.8048508167266846, + "objective/rlhf_reward": -2.819403341412544, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.953498363494873, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7375788688659668, + "step": 1349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000728130340576 + }, + { + "episode": 21616, + "epoch": 0.3885393823920624, + "loss/policy_avg": 0.32980287075042725, + "lr": 2.741180981595092e-06, + "objective/entropy": -87.61666107177734, + "objective/kl": 13.81655216217041, + "objective/non_score_reward": -1.3816550970077515, + "objective/rlhf_reward": -3.701791579994272, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 58.40357971191406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8026307821273804, + "step": 1350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001197576522827 + }, + { + "episode": 21632, + "epoch": 0.38882697630945107, + "loss/policy_avg": 1.0421653985977173, + "lr": 2.740989263803681e-06, + "objective/entropy": 295.99005126953125, + "objective/kl": 13.983240127563477, + "objective/non_score_reward": -1.3983240127563477, + "objective/rlhf_reward": -3.193296259641647, + "objective/scores": 0.6, + "policy/approxkl_avg": 161.0973358154297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6929864883422852, + "step": 1351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990979433059692 + }, + { + "episode": 21648, + "epoch": 0.3891145702268397, + "loss/policy_avg": -0.03795725852251053, + "lr": 2.74079754601227e-06, + "objective/entropy": 205.16946411132812, + "objective/kl": 16.07244873046875, + "objective/non_score_reward": -1.607244610786438, + "objective/rlhf_reward": -6.028978353738784, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.50408172607422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8304120302200317, + "step": 1352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998217821121216 + }, + { + "episode": 21664, + "epoch": 0.38940216414422835, + "loss/policy_avg": 0.558219850063324, + "lr": 2.740605828220859e-06, + "objective/entropy": -41.869117736816406, + "objective/kl": 12.831666946411133, + "objective/non_score_reward": -1.2831666469573975, + "objective/rlhf_reward": -4.732666528224945, + "objective/scores": 0.1, + "policy/approxkl_avg": 51.029197692871094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8420255780220032, + "step": 1353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978086948394775 + }, + { + "episode": 21680, + "epoch": 0.389689758061617, + "loss/policy_avg": 0.07326959073543549, + "lr": 2.740414110429448e-06, + "objective/entropy": 153.06631469726562, + "objective/kl": 15.971595764160156, + "objective/non_score_reward": -1.597159743309021, + "objective/rlhf_reward": -3.4649201377641887, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 31.896522521972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7210527658462524, + "step": 1354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981770515441895 + }, + { + "episode": 21696, + "epoch": 0.3899773519790056, + "loss/policy_avg": 0.4767560064792633, + "lr": 2.740222392638037e-06, + "objective/entropy": 231.67868041992188, + "objective/kl": 16.846141815185547, + "objective/non_score_reward": -1.6846144199371338, + "objective/rlhf_reward": -5.005124107996622, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 137.12591552734375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.764703094959259, + "step": 1355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972622394561768 + }, + { + "episode": 21712, + "epoch": 0.39026494589639427, + "loss/policy_avg": 0.11291810870170593, + "lr": 2.7400306748466256e-06, + "objective/entropy": 142.6898956298828, + "objective/kl": 17.51624298095703, + "objective/non_score_reward": -1.751624345779419, + "objective/rlhf_reward": -6.606497681140899, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.98486328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6645491123199463, + "step": 1356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972184896469116 + }, + { + "episode": 21728, + "epoch": 0.39055253981378296, + "loss/policy_avg": 0.4059602618217468, + "lr": 2.739838957055215e-06, + "objective/entropy": 133.1064453125, + "objective/kl": 14.388803482055664, + "objective/non_score_reward": -1.438880443572998, + "objective/rlhf_reward": -2.8318024619829387, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 34.9724006652832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7247166633605957, + "step": 1357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998490810394287 + }, + { + "episode": 21744, + "epoch": 0.3908401337311716, + "loss/policy_avg": 0.1599498689174652, + "lr": 2.7396472392638037e-06, + "objective/entropy": 46.026268005371094, + "objective/kl": 7.916892051696777, + "objective/non_score_reward": -0.7916892766952515, + "objective/rlhf_reward": -5.166757106781006, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.668688774108887, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6974193453788757, + "step": 1358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0124077796936035 + }, + { + "episode": 21760, + "epoch": 0.39112772764856024, + "loss/policy_avg": 0.11326786130666733, + "lr": 2.739455521472393e-06, + "objective/entropy": 139.10223388671875, + "objective/kl": 18.496702194213867, + "objective/non_score_reward": -1.84967041015625, + "objective/rlhf_reward": -2.9986812531948086, + "objective/scores": 1.1, + "policy/approxkl_avg": 94.51484680175781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8132376670837402, + "step": 1359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987239837646484 + }, + { + "episode": 21776, + "epoch": 0.3914153215659489, + "loss/policy_avg": 0.12141874432563782, + "lr": 2.7392638036809817e-06, + "objective/entropy": 65.53262329101562, + "objective/kl": 17.54939842224121, + "objective/non_score_reward": -1.7549399137496948, + "objective/rlhf_reward": -6.619759654998779, + "objective/scores": 0.1, + "policy/approxkl_avg": 71.7486572265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6637197732925415, + "step": 1360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0016891956329346 + }, + { + "episode": 21792, + "epoch": 0.3917029154833375, + "loss/policy_avg": 0.07374750077724457, + "lr": 2.7390720858895705e-06, + "objective/entropy": -212.7354736328125, + "objective/kl": 8.046335220336914, + "objective/non_score_reward": -0.8046334981918335, + "objective/rlhf_reward": -2.818534111976623, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.865436553955078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7911792993545532, + "step": 1361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985687732696533 + }, + { + "episode": 21808, + "epoch": 0.39199050940072616, + "loss/policy_avg": 0.7190690040588379, + "lr": 2.7388803680981598e-06, + "objective/entropy": 244.15914916992188, + "objective/kl": 7.830224990844727, + "objective/non_score_reward": -0.7830224633216858, + "objective/rlhf_reward": -1.1846786839532215, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.115243911743164, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6153068542480469, + "step": 1362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000927209854126 + }, + { + "episode": 21824, + "epoch": 0.3922781033181148, + "loss/policy_avg": 0.2109440267086029, + "lr": 2.7386886503067486e-06, + "objective/entropy": -58.729270935058594, + "objective/kl": 21.156822204589844, + "objective/non_score_reward": -2.1156821250915527, + "objective/rlhf_reward": -4.062729096412658, + "objective/scores": 1.1, + "policy/approxkl_avg": 108.97077941894531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6496853828430176, + "step": 1363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0006513595581055 + }, + { + "episode": 21840, + "epoch": 0.3925656972355035, + "loss/policy_avg": -0.1141444593667984, + "lr": 2.738496932515338e-06, + "objective/entropy": 30.81436538696289, + "objective/kl": 17.293773651123047, + "objective/non_score_reward": -1.7293777465820312, + "objective/rlhf_reward": -5.09268179831178, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 40.24468994140625, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5824557542800903, + "step": 1364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977633953094482 + }, + { + "episode": 21856, + "epoch": 0.39285329115289214, + "loss/policy_avg": 0.20132741332054138, + "lr": 2.7383052147239266e-06, + "objective/entropy": 103.67611694335938, + "objective/kl": 13.289691925048828, + "objective/non_score_reward": -1.3289692401885986, + "objective/rlhf_reward": -4.915877020359039, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.196710586547852, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.62409508228302, + "step": 1365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994707107543945 + }, + { + "episode": 21872, + "epoch": 0.3931408850702808, + "loss/policy_avg": 0.20989450812339783, + "lr": 2.7381134969325154e-06, + "objective/entropy": 99.08970642089844, + "objective/kl": 14.397649765014648, + "objective/non_score_reward": -1.4397649765014648, + "objective/rlhf_reward": -5.359060144424438, + "objective/scores": 0.1, + "policy/approxkl_avg": 47.96294403076172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7722165584564209, + "step": 1366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997646689414978 + }, + { + "episode": 21888, + "epoch": 0.3934284789876694, + "loss/policy_avg": 0.17023280262947083, + "lr": 2.7379217791411042e-06, + "objective/entropy": 146.130615234375, + "objective/kl": 14.226318359375, + "objective/non_score_reward": -1.422631859779358, + "objective/rlhf_reward": -4.134268193450525, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 30.81444549560547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5660938024520874, + "step": 1367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002405643463135 + }, + { + "episode": 21904, + "epoch": 0.39371607290505806, + "loss/policy_avg": 0.38309454917907715, + "lr": 2.737730061349693e-06, + "objective/entropy": 90.76017761230469, + "objective/kl": 13.75072956085205, + "objective/non_score_reward": -1.375072956085205, + "objective/rlhf_reward": -1.1002921894192692, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.77094268798828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8311518430709839, + "step": 1368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0005173683166504 + }, + { + "episode": 21920, + "epoch": 0.3940036668224467, + "loss/policy_avg": 0.24230515956878662, + "lr": 2.7375383435582823e-06, + "objective/entropy": 248.60150146484375, + "objective/kl": 15.170378684997559, + "objective/non_score_reward": -1.5170379877090454, + "objective/rlhf_reward": -3.1444328769457073, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.941917657852173, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6488876938819885, + "step": 1369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001039743423462 + }, + { + "episode": 21936, + "epoch": 0.39429126073983534, + "loss/policy_avg": 1.0242424011230469, + "lr": 2.737346625766871e-06, + "objective/entropy": 74.42791748046875, + "objective/kl": 14.777740478515625, + "objective/non_score_reward": -1.4777741432189941, + "objective/rlhf_reward": -3.5110965132713314, + "objective/scores": 0.6, + "policy/approxkl_avg": 65.65885162353516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.595373272895813, + "step": 1370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9968748092651367 + }, + { + "episode": 21952, + "epoch": 0.394578854657224, + "loss/policy_avg": 0.04835711419582367, + "lr": 2.73715490797546e-06, + "objective/entropy": 134.174560546875, + "objective/kl": 13.336694717407227, + "objective/non_score_reward": -1.3336694240570068, + "objective/rlhf_reward": -4.9346778154373165, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.234766006469727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7185621857643127, + "step": 1371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0065627098083496 + }, + { + "episode": 21968, + "epoch": 0.39486644857461267, + "loss/policy_avg": 0.2938240170478821, + "lr": 2.736963190184049e-06, + "objective/entropy": -30.205530166625977, + "objective/kl": 17.3317928314209, + "objective/non_score_reward": -1.7331793308258057, + "objective/rlhf_reward": -5.1993836919466645, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 119.18181610107422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6755663156509399, + "step": 1372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999345302581787 + }, + { + "episode": 21984, + "epoch": 0.3951540424920013, + "loss/policy_avg": 0.3882046937942505, + "lr": 2.736771472392638e-06, + "objective/entropy": 97.84891510009766, + "objective/kl": 13.398181915283203, + "objective/non_score_reward": -1.3398182392120361, + "objective/rlhf_reward": -3.4118618769215896, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 45.98151779174805, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5983450412750244, + "step": 1373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974939823150635 + }, + { + "episode": 22000, + "epoch": 0.39544163640938995, + "loss/policy_avg": 0.3626946210861206, + "lr": 2.736579754601227e-06, + "objective/entropy": 151.31686401367188, + "objective/kl": 14.419286727905273, + "objective/non_score_reward": -1.4419286251068115, + "objective/rlhf_reward": -1.3677145004272457, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.240350723266602, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4013226628303528, + "step": 1374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00012469291687 + }, + { + "episode": 22016, + "epoch": 0.3957292303267786, + "loss/policy_avg": -0.1479598432779312, + "lr": 2.736388036809816e-06, + "objective/entropy": -0.02170562744140625, + "objective/kl": 12.707273483276367, + "objective/non_score_reward": -1.2707273960113525, + "objective/rlhf_reward": -2.1591904580008716, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 17.358325958251953, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6573648452758789, + "step": 1375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988689422607422 + }, + { + "episode": 22032, + "epoch": 0.39601682424416723, + "loss/policy_avg": -0.3523472249507904, + "lr": 2.736196319018405e-06, + "objective/entropy": 113.51665496826172, + "objective/kl": 12.600326538085938, + "objective/non_score_reward": -1.2600325345993042, + "objective/rlhf_reward": -2.1164110049020977, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.17249298095703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5286760926246643, + "step": 1376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002790927886963 + }, + { + "episode": 22048, + "epoch": 0.39630441816155587, + "loss/policy_avg": 0.06261734664440155, + "lr": 2.736004601226994e-06, + "objective/entropy": 236.24249267578125, + "objective/kl": 8.30074691772461, + "objective/non_score_reward": -0.830074667930603, + "objective/rlhf_reward": -2.920298492908478, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.030731439590454, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.586647629737854, + "step": 1377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0025887489318848 + }, + { + "episode": 22064, + "epoch": 0.3965920120789445, + "loss/policy_avg": -0.276256799697876, + "lr": 2.735812883435583e-06, + "objective/entropy": 147.0058135986328, + "objective/kl": 14.008570671081543, + "objective/non_score_reward": -1.4008569717407227, + "objective/rlhf_reward": -3.203427946567535, + "objective/scores": 0.6, + "policy/approxkl_avg": 46.215850830078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6285009980201721, + "step": 1378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.008394479751587 + }, + { + "episode": 22080, + "epoch": 0.39687960599633315, + "loss/policy_avg": 0.13082417845726013, + "lr": 2.7356211656441717e-06, + "objective/entropy": -202.0, + "objective/kl": 11.788710594177246, + "objective/non_score_reward": -1.1788710355758667, + "objective/rlhf_reward": -0.31548414230346644, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.524956703186035, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8542511463165283, + "step": 1379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9987741708755493 + }, + { + "episode": 22096, + "epoch": 0.39716719991372185, + "loss/policy_avg": 0.095906101167202, + "lr": 2.735429447852761e-06, + "objective/entropy": 80.37876892089844, + "objective/kl": 14.221534729003906, + "objective/non_score_reward": -1.4221534729003906, + "objective/rlhf_reward": -5.288614249229431, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.4239659309387207, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5438545942306519, + "step": 1380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001100778579712 + }, + { + "episode": 22112, + "epoch": 0.3974547938311105, + "loss/policy_avg": 0.7796217203140259, + "lr": 2.7352377300613497e-06, + "objective/entropy": -26.782432556152344, + "objective/kl": 10.952417373657227, + "objective/non_score_reward": -1.0952417850494385, + "objective/rlhf_reward": -1.4572480961096015, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 52.988441467285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8282052278518677, + "step": 1381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9960274696350098 + }, + { + "episode": 22128, + "epoch": 0.3977423877484991, + "loss/policy_avg": 0.33992427587509155, + "lr": 2.735046012269939e-06, + "objective/entropy": 25.139671325683594, + "objective/kl": 12.910887718200684, + "objective/non_score_reward": -1.2910888195037842, + "objective/rlhf_reward": -7.164355278015137, + "objective/scores": -0.5, + "policy/approxkl_avg": 67.68551635742188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.633919894695282, + "step": 1382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0026700496673584 + }, + { + "episode": 22144, + "epoch": 0.39802998166588777, + "loss/policy_avg": 0.20044741034507751, + "lr": 2.7348542944785277e-06, + "objective/entropy": -16.74477767944336, + "objective/kl": 15.490436553955078, + "objective/non_score_reward": -1.5490437746047974, + "objective/rlhf_reward": -5.796175158023834, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.735712051391602, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.637913703918457, + "step": 1383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990465641021729 + }, + { + "episode": 22160, + "epoch": 0.3983175755832764, + "loss/policy_avg": -0.0019893571734428406, + "lr": 2.7346625766871165e-06, + "objective/entropy": -15.93057632446289, + "objective/kl": 14.116677284240723, + "objective/non_score_reward": -1.4116679430007935, + "objective/rlhf_reward": -7.646671772003174, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.67679214477539, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7569680213928223, + "step": 1384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990018606185913 + }, + { + "episode": 22176, + "epoch": 0.39860516950066505, + "loss/policy_avg": 0.8097996115684509, + "lr": 2.7344708588957058e-06, + "objective/entropy": 65.41546630859375, + "objective/kl": 14.531415939331055, + "objective/non_score_reward": -1.453141689300537, + "objective/rlhf_reward": -7.812566757202148, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.36127853393555, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6726914048194885, + "step": 1385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987540245056152 + }, + { + "episode": 22192, + "epoch": 0.3988927634180537, + "loss/policy_avg": 0.2525538206100464, + "lr": 2.7342791411042946e-06, + "objective/entropy": -36.927734375, + "objective/kl": 15.043611526489258, + "objective/non_score_reward": -1.5043611526489258, + "objective/rlhf_reward": -8.017444610595703, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.599006652832031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.804946780204773, + "step": 1386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9973516464233398 + }, + { + "episode": 22208, + "epoch": 0.3991803573354424, + "loss/policy_avg": 0.04792652279138565, + "lr": 2.734087423312884e-06, + "objective/entropy": 257.07891845703125, + "objective/kl": 17.709779739379883, + "objective/non_score_reward": -1.7709778547286987, + "objective/rlhf_reward": -6.683911597728729, + "objective/scores": 0.1, + "policy/approxkl_avg": 89.76461029052734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6373671889305115, + "step": 1387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984499216079712 + }, + { + "episode": 22224, + "epoch": 0.399467951252831, + "loss/policy_avg": 0.22276929020881653, + "lr": 2.7338957055214726e-06, + "objective/entropy": 262.33367919921875, + "objective/kl": 20.37179946899414, + "objective/non_score_reward": -2.037179946899414, + "objective/rlhf_reward": -3.748719549179077, + "objective/scores": 1.1, + "policy/approxkl_avg": 57.362403869628906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7482947707176208, + "step": 1388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991297721862793 + }, + { + "episode": 22240, + "epoch": 0.39975554517021966, + "loss/policy_avg": 0.013198129832744598, + "lr": 2.7337039877300614e-06, + "objective/entropy": 56.792144775390625, + "objective/kl": 17.05535888671875, + "objective/non_score_reward": -1.7055360078811646, + "objective/rlhf_reward": -3.898425136448118, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.213428258895874, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5146496295928955, + "step": 1389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000297784805298 + }, + { + "episode": 22256, + "epoch": 0.4000431390876083, + "loss/policy_avg": 0.7818821668624878, + "lr": 2.7335122699386503e-06, + "objective/entropy": 14.627386093139648, + "objective/kl": 11.377643585205078, + "objective/non_score_reward": -1.1377642154693604, + "objective/rlhf_reward": -1.6273382350218024, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 53.005706787109375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6652359962463379, + "step": 1390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997227668762207 + }, + { + "episode": 22272, + "epoch": 0.40033073300499694, + "loss/policy_avg": 0.6206638813018799, + "lr": 2.733320552147239e-06, + "objective/entropy": 60.79871368408203, + "objective/kl": 15.99708366394043, + "objective/non_score_reward": -1.5997084379196167, + "objective/rlhf_reward": -5.998833811283111, + "objective/scores": 0.1, + "policy/approxkl_avg": 117.21906280517578, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6628750562667847, + "step": 1391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984533786773682 + }, + { + "episode": 22288, + "epoch": 0.4006183269223856, + "loss/policy_avg": -0.016587669029831886, + "lr": 2.7331288343558283e-06, + "objective/entropy": -194.74586486816406, + "objective/kl": 10.784317016601562, + "objective/non_score_reward": -1.0784317255020142, + "objective/rlhf_reward": -2.757467686143473, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 32.546348571777344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7876214385032654, + "step": 1392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989862442016602 + }, + { + "episode": 22304, + "epoch": 0.4009059208397742, + "loss/policy_avg": 0.2942638099193573, + "lr": 2.732937116564417e-06, + "objective/entropy": -21.264427185058594, + "objective/kl": 14.427265167236328, + "objective/non_score_reward": -1.4427263736724854, + "objective/rlhf_reward": -3.3709056735038754, + "objective/scores": 0.6, + "policy/approxkl_avg": 108.87052154541016, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8786087036132812, + "step": 1393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996511459350586 + }, + { + "episode": 22320, + "epoch": 0.40119351475716286, + "loss/policy_avg": 0.4861958622932434, + "lr": 2.732745398773006e-06, + "objective/entropy": -2.8598480224609375, + "objective/kl": 17.84203338623047, + "objective/non_score_reward": -1.784203290939331, + "objective/rlhf_reward": -6.736813253164291, + "objective/scores": 0.1, + "policy/approxkl_avg": 70.02932739257812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.599122166633606, + "step": 1394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993877410888672 + }, + { + "episode": 22336, + "epoch": 0.40148110867455156, + "loss/policy_avg": 0.0992339476943016, + "lr": 2.732553680981595e-06, + "objective/entropy": -72.45336151123047, + "objective/kl": 12.332319259643555, + "objective/non_score_reward": -1.2332319021224976, + "objective/rlhf_reward": -2.532927787303924, + "objective/scores": 0.6, + "policy/approxkl_avg": 45.31598663330078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7159440517425537, + "step": 1395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985222816467285 + }, + { + "episode": 22352, + "epoch": 0.4017687025919402, + "loss/policy_avg": -0.0790301039814949, + "lr": 2.732361963190184e-06, + "objective/entropy": 32.95172119140625, + "objective/kl": 16.841075897216797, + "objective/non_score_reward": -1.6841075420379639, + "objective/rlhf_reward": -6.336429929733276, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.57403564453125, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7559831142425537, + "step": 1396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001506805419922 + }, + { + "episode": 22368, + "epoch": 0.40205629650932884, + "loss/policy_avg": 0.03451812267303467, + "lr": 2.732170245398773e-06, + "objective/entropy": -18.881187438964844, + "objective/kl": 15.064776420593262, + "objective/non_score_reward": -1.5064775943756104, + "objective/rlhf_reward": -8.025910377502441, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.9893798828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4877445101737976, + "step": 1397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996527433395386 + }, + { + "episode": 22384, + "epoch": 0.4023438904267175, + "loss/policy_avg": 0.05204878747463226, + "lr": 2.731978527607362e-06, + "objective/entropy": 200.4969482421875, + "objective/kl": 14.600479125976562, + "objective/non_score_reward": -1.460047960281372, + "objective/rlhf_reward": -7.840191841125488, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.002072334289551, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7866439819335938, + "step": 1398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002009868621826 + }, + { + "episode": 22400, + "epoch": 0.4026314843441061, + "loss/policy_avg": -0.12847991287708282, + "lr": 2.731786809815951e-06, + "objective/entropy": 79.1689224243164, + "objective/kl": 18.852502822875977, + "objective/non_score_reward": -1.8852503299713135, + "objective/rlhf_reward": -7.141001439094543, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.583232879638672, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.833438515663147, + "step": 1399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000863552093506 + }, + { + "episode": 22416, + "epoch": 0.40291907826149476, + "loss/policy_avg": 0.2065950632095337, + "lr": 2.73159509202454e-06, + "objective/entropy": 3.9622802734375, + "objective/kl": 11.335432052612305, + "objective/non_score_reward": -1.1335433721542358, + "objective/rlhf_reward": -6.534173488616943, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.01820755004883, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.81622713804245, + "step": 1400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971067905426025 + }, + { + "episode": 22432, + "epoch": 0.4032066721788834, + "loss/policy_avg": 0.13497845828533173, + "lr": 2.731403374233129e-06, + "objective/entropy": -37.79541015625, + "objective/kl": 17.951873779296875, + "objective/non_score_reward": -1.7951874732971191, + "objective/rlhf_reward": -9.180749893188477, + "objective/scores": -0.5, + "policy/approxkl_avg": 58.593788146972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5796679854393005, + "step": 1401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996742844581604 + }, + { + "episode": 22448, + "epoch": 0.40349426609627204, + "loss/policy_avg": 0.27193281054496765, + "lr": 2.731211656441718e-06, + "objective/entropy": 174.13641357421875, + "objective/kl": 13.39457893371582, + "objective/non_score_reward": -1.3394579887390137, + "objective/rlhf_reward": -3.2351258418717723, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 26.27016830444336, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7780320644378662, + "step": 1402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996938705444336 + }, + { + "episode": 22464, + "epoch": 0.40378186001366073, + "loss/policy_avg": 2.09384822845459, + "lr": 2.731019938650307e-06, + "objective/entropy": -58.64219665527344, + "objective/kl": 10.139740943908691, + "objective/non_score_reward": -1.0139741897583008, + "objective/rlhf_reward": -2.451776567761021, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 29.046207427978516, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6864455938339233, + "step": 1403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002877712249756 + }, + { + "episode": 22480, + "epoch": 0.40406945393104937, + "loss/policy_avg": -0.19483403861522675, + "lr": 2.7308282208588957e-06, + "objective/entropy": -197.337158203125, + "objective/kl": 15.854147911071777, + "objective/non_score_reward": -1.5854146480560303, + "objective/rlhf_reward": -5.94165871143341, + "objective/scores": 0.1, + "policy/approxkl_avg": 154.90176391601562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47332537174224854, + "step": 1404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990125894546509 + }, + { + "episode": 22496, + "epoch": 0.404357047848438, + "loss/policy_avg": 0.23371820151805878, + "lr": 2.730636503067485e-06, + "objective/entropy": 301.93804931640625, + "objective/kl": 21.32901382446289, + "objective/non_score_reward": -2.132901668548584, + "objective/rlhf_reward": -10.531606674194336, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.739236831665039, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.8294570446014404, + "step": 1405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999563217163086 + }, + { + "episode": 22512, + "epoch": 0.40464464176582665, + "loss/policy_avg": 0.16459359228610992, + "lr": 2.7304447852760737e-06, + "objective/entropy": 308.935791015625, + "objective/kl": 19.416046142578125, + "objective/non_score_reward": -1.941604733467102, + "objective/rlhf_reward": -9.76641845703125, + "objective/scores": -0.5, + "policy/approxkl_avg": 69.8115234375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.786815881729126, + "step": 1406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998669147491455 + }, + { + "episode": 22528, + "epoch": 0.4049322356832153, + "loss/policy_avg": 0.26910632848739624, + "lr": 2.7302530674846626e-06, + "objective/entropy": 21.283084869384766, + "objective/kl": 11.796088218688965, + "objective/non_score_reward": -1.1796088218688965, + "objective/rlhf_reward": -3.0565758399373157, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 6.656482219696045, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6182963848114014, + "step": 1407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966087341308594 + }, + { + "episode": 22544, + "epoch": 0.40521982960060393, + "loss/policy_avg": 0.7190654873847961, + "lr": 2.7300613496932518e-06, + "objective/entropy": -6.668407440185547, + "objective/kl": 18.76657485961914, + "objective/non_score_reward": -1.8766577243804932, + "objective/rlhf_reward": -7.106630867719651, + "objective/scores": 0.1, + "policy/approxkl_avg": 211.33731079101562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6118903756141663, + "step": 1408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998753547668457 + }, + { + "episode": 22560, + "epoch": 0.40550742351799257, + "loss/policy_avg": 1.4786778688430786, + "lr": 2.7298696319018406e-06, + "objective/entropy": -25.83483123779297, + "objective/kl": 18.464750289916992, + "objective/non_score_reward": -1.8464751243591309, + "objective/rlhf_reward": -6.985900437831878, + "objective/scores": 0.1, + "policy/approxkl_avg": 110.1044692993164, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5393742322921753, + "step": 1409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991658926010132 + }, + { + "episode": 22576, + "epoch": 0.40579501743538127, + "loss/policy_avg": 0.15572980046272278, + "lr": 2.72967791411043e-06, + "objective/entropy": -167.088134765625, + "objective/kl": 13.024639129638672, + "objective/non_score_reward": -1.3024640083312988, + "objective/rlhf_reward": -2.286137257457945, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 51.253448486328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6638354659080505, + "step": 1410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000250816345215 + }, + { + "episode": 22592, + "epoch": 0.4060826113527699, + "loss/policy_avg": -0.009990356862545013, + "lr": 2.7294861963190186e-06, + "objective/entropy": -264.7129821777344, + "objective/kl": 14.578704833984375, + "objective/non_score_reward": -1.4578704833984375, + "objective/rlhf_reward": -5.431481993198394, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.5859309434890747, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7239521741867065, + "step": 1411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0019497871398926 + }, + { + "episode": 22608, + "epoch": 0.40637020527015855, + "loss/policy_avg": 0.5303527116775513, + "lr": 2.7292944785276074e-06, + "objective/entropy": -19.109874725341797, + "objective/kl": 13.917287826538086, + "objective/non_score_reward": -1.3917287588119507, + "objective/rlhf_reward": -5.166914796829223, + "objective/scores": 0.1, + "policy/approxkl_avg": 28.013973236083984, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47340190410614014, + "step": 1412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0010433197021484 + }, + { + "episode": 22624, + "epoch": 0.4066577991875472, + "loss/policy_avg": 0.1528145968914032, + "lr": 2.7291027607361963e-06, + "objective/entropy": 87.03872680664062, + "objective/kl": 13.098251342773438, + "objective/non_score_reward": -1.30982506275177, + "objective/rlhf_reward": -2.8393002212047573, + "objective/scores": 0.6, + "policy/approxkl_avg": 130.48788452148438, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.508557915687561, + "step": 1413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990100860595703 + }, + { + "episode": 22640, + "epoch": 0.4069453931049358, + "loss/policy_avg": -0.11118362098932266, + "lr": 2.728911042944785e-06, + "objective/entropy": 36.70410919189453, + "objective/kl": 12.045846939086914, + "objective/non_score_reward": -1.2045847177505493, + "objective/rlhf_reward": -6.818338871002197, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.107789993286133, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.723143458366394, + "step": 1414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0014891624450684 + }, + { + "episode": 22656, + "epoch": 0.40723298702232447, + "loss/policy_avg": 0.1664956957101822, + "lr": 2.7287193251533743e-06, + "objective/entropy": 195.51280212402344, + "objective/kl": 16.488208770751953, + "objective/non_score_reward": -1.6488208770751953, + "objective/rlhf_reward": -8.595283508300781, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.559993743896484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9423944354057312, + "step": 1415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9988129138946533 + }, + { + "episode": 22672, + "epoch": 0.4075205809397131, + "loss/policy_avg": 0.18345528841018677, + "lr": 2.728527607361963e-06, + "objective/entropy": -121.7177734375, + "objective/kl": 11.121731758117676, + "objective/non_score_reward": -1.1121731996536255, + "objective/rlhf_reward": -0.04869285821914637, + "objective/scores": 1.1, + "policy/approxkl_avg": 20.33164405822754, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6241345405578613, + "step": 1416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980192184448242 + }, + { + "episode": 22688, + "epoch": 0.40780817485710175, + "loss/policy_avg": -0.056718356907367706, + "lr": 2.7283358895705523e-06, + "objective/entropy": 34.303070068359375, + "objective/kl": 12.5802583694458, + "objective/non_score_reward": -1.258025884628296, + "objective/rlhf_reward": -2.6321037769317623, + "objective/scores": 0.6, + "policy/approxkl_avg": 6.837141036987305, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.74076908826828, + "step": 1417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.003502368927002 + }, + { + "episode": 22704, + "epoch": 0.40809576877449044, + "loss/policy_avg": 0.34876778721809387, + "lr": 2.728144171779141e-06, + "objective/entropy": 141.2511749267578, + "objective/kl": 16.881202697753906, + "objective/non_score_reward": -1.6881201267242432, + "objective/rlhf_reward": -4.927651996883462, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 30.830556869506836, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5224217176437378, + "step": 1418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000737190246582 + }, + { + "episode": 22720, + "epoch": 0.4083833626918791, + "loss/policy_avg": 0.1370847225189209, + "lr": 2.72795245398773e-06, + "objective/entropy": 131.04425048828125, + "objective/kl": 12.592981338500977, + "objective/non_score_reward": -1.259298324584961, + "objective/rlhf_reward": -7.037193298339844, + "objective/scores": -0.5, + "policy/approxkl_avg": 79.68668365478516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7526917457580566, + "step": 1419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970409870147705 + }, + { + "episode": 22736, + "epoch": 0.4086709566092677, + "loss/policy_avg": 0.7799844741821289, + "lr": 2.727760736196319e-06, + "objective/entropy": 110.47119140625, + "objective/kl": 18.634571075439453, + "objective/non_score_reward": -1.8634570837020874, + "objective/rlhf_reward": -9.453828811645508, + "objective/scores": -0.5, + "policy/approxkl_avg": 91.78942108154297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5299524068832397, + "step": 1420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996536135673523 + }, + { + "episode": 22752, + "epoch": 0.40895855052665636, + "loss/policy_avg": 0.27093034982681274, + "lr": 2.727569018404908e-06, + "objective/entropy": 98.92936706542969, + "objective/kl": 6.821386814117432, + "objective/non_score_reward": -0.6821386814117432, + "objective/rlhf_reward": -0.3285546362400056, + "objective/scores": 0.6, + "policy/approxkl_avg": 32.1151123046875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5731009244918823, + "step": 1421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969843626022339 + }, + { + "episode": 22768, + "epoch": 0.409246144444045, + "loss/policy_avg": 0.10417535901069641, + "lr": 2.727377300613497e-06, + "objective/entropy": -63.33757781982422, + "objective/kl": 14.846542358398438, + "objective/non_score_reward": -1.484654188156128, + "objective/rlhf_reward": -7.9386162757873535, + "objective/scores": -0.5, + "policy/approxkl_avg": 85.31866455078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5758928060531616, + "step": 1422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978985786437988 + }, + { + "episode": 22784, + "epoch": 0.40953373836143364, + "loss/policy_avg": 0.028347529470920563, + "lr": 2.727185582822086e-06, + "objective/entropy": 2.5661659240722656, + "objective/kl": 21.124889373779297, + "objective/non_score_reward": -2.1124889850616455, + "objective/rlhf_reward": -4.049956119060516, + "objective/scores": 1.1, + "policy/approxkl_avg": 97.22557067871094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8526643514633179, + "step": 1423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998213768005371 + }, + { + "episode": 22800, + "epoch": 0.4098213322788223, + "loss/policy_avg": -0.12718230485916138, + "lr": 2.726993865030675e-06, + "objective/entropy": 271.2698059082031, + "objective/kl": 15.589737892150879, + "objective/non_score_reward": -1.558973789215088, + "objective/rlhf_reward": -8.235895156860352, + "objective/scores": -0.5, + "policy/approxkl_avg": 34.46259307861328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7287672758102417, + "step": 1424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00382661819458 + }, + { + "episode": 22816, + "epoch": 0.410108926196211, + "loss/policy_avg": 0.017169015482068062, + "lr": 2.726802147239264e-06, + "objective/entropy": 83.00675201416016, + "objective/kl": 19.48290252685547, + "objective/non_score_reward": -1.9482901096343994, + "objective/rlhf_reward": -3.393160572648048, + "objective/scores": 1.1, + "policy/approxkl_avg": 7.972476959228516, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5477663278579712, + "step": 1425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99813711643219 + }, + { + "episode": 22832, + "epoch": 0.4103965201135996, + "loss/policy_avg": 0.08301222324371338, + "lr": 2.726610429447853e-06, + "objective/entropy": 29.15322494506836, + "objective/kl": 16.26679801940918, + "objective/non_score_reward": -1.62667977809906, + "objective/rlhf_reward": -8.506719589233398, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.79837799072266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6539818048477173, + "step": 1426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011935234069824 + }, + { + "episode": 22848, + "epoch": 0.41068411403098826, + "loss/policy_avg": 0.18329089879989624, + "lr": 2.7264187116564417e-06, + "objective/entropy": -10.258651733398438, + "objective/kl": 12.080020904541016, + "objective/non_score_reward": -1.2080020904541016, + "objective/rlhf_reward": -3.0071794643727054, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 25.368099212646484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.47818294167518616, + "step": 1427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991447925567627 + }, + { + "episode": 22864, + "epoch": 0.4109717079483769, + "loss/policy_avg": 1.2192856073379517, + "lr": 2.726226993865031e-06, + "objective/entropy": 100.44667053222656, + "objective/kl": 11.624716758728027, + "objective/non_score_reward": -1.1624715328216553, + "objective/rlhf_reward": -4.249886131286621, + "objective/scores": 0.1, + "policy/approxkl_avg": 85.25991821289062, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7524755001068115, + "step": 1428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0001072883605957 + }, + { + "episode": 22880, + "epoch": 0.41125930186576554, + "loss/policy_avg": 0.07868831604719162, + "lr": 2.7260352760736197e-06, + "objective/entropy": -73.34170532226562, + "objective/kl": 16.806835174560547, + "objective/non_score_reward": -1.6806833744049072, + "objective/rlhf_reward": -2.3227337360382077, + "objective/scores": 1.1, + "policy/approxkl_avg": 62.879486083984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5852820873260498, + "step": 1429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9978783130645752 + }, + { + "episode": 22896, + "epoch": 0.4115468957831542, + "loss/policy_avg": 0.5681151151657104, + "lr": 2.7258435582822086e-06, + "objective/entropy": 109.1818618774414, + "objective/kl": 12.72823429107666, + "objective/non_score_reward": -1.2728233337402344, + "objective/rlhf_reward": -0.6912934094667431, + "objective/scores": 1.1, + "policy/approxkl_avg": 60.83876037597656, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.51080322265625, + "step": 1430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997774600982666 + }, + { + "episode": 22912, + "epoch": 0.4118344897005428, + "loss/policy_avg": 0.3500422239303589, + "lr": 2.725651840490798e-06, + "objective/entropy": 66.81685638427734, + "objective/kl": 14.291481971740723, + "objective/non_score_reward": -1.4291484355926514, + "objective/rlhf_reward": -7.7165937423706055, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.064986228942871, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7346335649490356, + "step": 1431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0009124279022217 + }, + { + "episode": 22928, + "epoch": 0.41212208361793146, + "loss/policy_avg": 0.2891772985458374, + "lr": 2.7254601226993866e-06, + "objective/entropy": 113.6800537109375, + "objective/kl": 19.13040542602539, + "objective/non_score_reward": -1.9130408763885498, + "objective/rlhf_reward": -9.6521635055542, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.885581970214844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5598267316818237, + "step": 1432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999432563781738 + }, + { + "episode": 22944, + "epoch": 0.41240967753532015, + "loss/policy_avg": 0.08044654130935669, + "lr": 2.725268404907976e-06, + "objective/entropy": 64.72499084472656, + "objective/kl": 20.169479370117188, + "objective/non_score_reward": -2.0169477462768555, + "objective/rlhf_reward": -7.667791402339935, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.26875114440918, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5961747169494629, + "step": 1433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996532678604126 + }, + { + "episode": 22960, + "epoch": 0.4126972714527088, + "loss/policy_avg": 0.3121386170387268, + "lr": 2.7250766871165642e-06, + "objective/entropy": -75.4444580078125, + "objective/kl": 23.821794509887695, + "objective/non_score_reward": -2.3821797370910645, + "objective/rlhf_reward": -5.128718531131744, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.88954734802246, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.77299964427948, + "step": 1434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982830286026 + }, + { + "episode": 22976, + "epoch": 0.41298486537009743, + "loss/policy_avg": 0.24237871170043945, + "lr": 2.7248849693251535e-06, + "objective/entropy": -32.58727264404297, + "objective/kl": 14.388275146484375, + "objective/non_score_reward": -1.4388275146484375, + "objective/rlhf_reward": -5.355309879779815, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.626853942871094, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8990691900253296, + "step": 1435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984760284423828 + }, + { + "episode": 22992, + "epoch": 0.41327245928748607, + "loss/policy_avg": 0.24794508516788483, + "lr": 2.7246932515337423e-06, + "objective/entropy": 143.39877319335938, + "objective/kl": 16.412979125976562, + "objective/non_score_reward": -1.6412980556488037, + "objective/rlhf_reward": -6.16519216299057, + "objective/scores": 0.1, + "policy/approxkl_avg": 54.83576202392578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6131792068481445, + "step": 1436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999814510345459 + }, + { + "episode": 23008, + "epoch": 0.4135600532048747, + "loss/policy_avg": 0.19519871473312378, + "lr": 2.724501533742331e-06, + "objective/entropy": -59.90544891357422, + "objective/kl": 17.853824615478516, + "objective/non_score_reward": -1.7853825092315674, + "objective/rlhf_reward": -5.4796707682019345, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 17.840316772460938, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4706451892852783, + "step": 1437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002455711364746 + }, + { + "episode": 23024, + "epoch": 0.41384764712226335, + "loss/policy_avg": 1.0160012245178223, + "lr": 2.7243098159509203e-06, + "objective/entropy": 211.19354248046875, + "objective/kl": 17.34637451171875, + "objective/non_score_reward": -1.7346374988555908, + "objective/rlhf_reward": -8.93855094909668, + "objective/scores": -0.5, + "policy/approxkl_avg": 252.24220275878906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7842428684234619, + "step": 1438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9965323209762573 + }, + { + "episode": 23040, + "epoch": 0.414135241039652, + "loss/policy_avg": 0.31407612562179565, + "lr": 2.724118098159509e-06, + "objective/entropy": 184.55502319335938, + "objective/kl": 18.949657440185547, + "objective/non_score_reward": -1.8949657678604126, + "objective/rlhf_reward": -7.1798629522323605, + "objective/scores": 0.1, + "policy/approxkl_avg": 145.09152221679688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6857846975326538, + "step": 1439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969500303268433 + }, + { + "episode": 23056, + "epoch": 0.41442283495704063, + "loss/policy_avg": 0.11179807037115097, + "lr": 2.7239263803680983e-06, + "objective/entropy": -39.264190673828125, + "objective/kl": 12.129672050476074, + "objective/non_score_reward": -1.2129671573638916, + "objective/rlhf_reward": -2.904457728342946, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 18.177127838134766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.660858154296875, + "step": 1440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9972420930862427 + }, + { + "episode": 23072, + "epoch": 0.4147104288744293, + "loss/policy_avg": -0.29350343346595764, + "lr": 2.723734662576687e-06, + "objective/entropy": 143.82705688476562, + "objective/kl": 8.183101654052734, + "objective/non_score_reward": -0.8183101415634155, + "objective/rlhf_reward": -0.34952164137479935, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.663167953491211, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6625618934631348, + "step": 1441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.023545742034912 + }, + { + "episode": 23088, + "epoch": 0.41499802279181797, + "loss/policy_avg": 0.12284829467535019, + "lr": 2.723542944785276e-06, + "objective/entropy": 57.83909225463867, + "objective/kl": 11.188475608825684, + "objective/non_score_reward": -1.1188476085662842, + "objective/rlhf_reward": -4.075390315055847, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.067108154296875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6835089921951294, + "step": 1442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000497817993164 + }, + { + "episode": 23104, + "epoch": 0.4152856167092066, + "loss/policy_avg": -0.24080030620098114, + "lr": 2.723351226993865e-06, + "objective/entropy": -229.4406280517578, + "objective/kl": 13.096569061279297, + "objective/non_score_reward": -1.309657096862793, + "objective/rlhf_reward": -4.8386283874511715, + "objective/scores": 0.1, + "policy/approxkl_avg": 33.96006774902344, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.798072338104248, + "step": 1443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 11, + "val/ratio": 2.0064001083374023 + }, + { + "episode": 23120, + "epoch": 0.41557321062659525, + "loss/policy_avg": 0.3806346356868744, + "lr": 2.723159509202454e-06, + "objective/entropy": -28.990707397460938, + "objective/kl": 21.558467864990234, + "objective/non_score_reward": -2.15584659576416, + "objective/rlhf_reward": -5.699667845607969, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 111.77049255371094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5212620496749878, + "step": 1444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998953104019165 + }, + { + "episode": 23136, + "epoch": 0.4158608045439839, + "loss/policy_avg": 0.2565423250198364, + "lr": 2.722967791411043e-06, + "objective/entropy": 42.08885955810547, + "objective/kl": 15.505270004272461, + "objective/non_score_reward": -1.5505270957946777, + "objective/rlhf_reward": -5.802108591794967, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.5589828491211, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5759496688842773, + "step": 1445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979947805404663 + }, + { + "episode": 23152, + "epoch": 0.4161483984613725, + "loss/policy_avg": 0.37869054079055786, + "lr": 2.722776073619632e-06, + "objective/entropy": -158.08641052246094, + "objective/kl": 16.30980682373047, + "objective/non_score_reward": -1.6309807300567627, + "objective/rlhf_reward": -6.12392292022705, + "objective/scores": 0.1, + "policy/approxkl_avg": 221.66566467285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6714792251586914, + "step": 1446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9963035583496094 + }, + { + "episode": 23168, + "epoch": 0.41643599237876117, + "loss/policy_avg": 0.44854480028152466, + "lr": 2.722584355828221e-06, + "objective/entropy": -37.4086799621582, + "objective/kl": 15.159543991088867, + "objective/non_score_reward": -1.5159544944763184, + "objective/rlhf_reward": -1.6638178586959835, + "objective/scores": 1.1, + "policy/approxkl_avg": 70.89303588867188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6588014364242554, + "step": 1447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998877763748169 + }, + { + "episode": 23184, + "epoch": 0.41672358629614986, + "loss/policy_avg": 0.4342793822288513, + "lr": 2.72239263803681e-06, + "objective/entropy": 81.91819763183594, + "objective/kl": 11.294848442077637, + "objective/non_score_reward": -1.1294848918914795, + "objective/rlhf_reward": -6.517939567565918, + "objective/scores": -0.5, + "policy/approxkl_avg": 39.474754333496094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7826581597328186, + "step": 1448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975943565368652 + }, + { + "episode": 23200, + "epoch": 0.4170111802135385, + "loss/policy_avg": 0.44055402278900146, + "lr": 2.722200920245399e-06, + "objective/entropy": 179.12945556640625, + "objective/kl": 16.172334671020508, + "objective/non_score_reward": -1.6172332763671875, + "objective/rlhf_reward": -3.5452140911829204, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 45.85475540161133, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6948776841163635, + "step": 1449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994070529937744 + }, + { + "episode": 23216, + "epoch": 0.41729877413092714, + "loss/policy_avg": 0.2199755609035492, + "lr": 2.7220092024539877e-06, + "objective/entropy": -139.42800903320312, + "objective/kl": 12.28195858001709, + "objective/non_score_reward": -1.2281959056854248, + "objective/rlhf_reward": -4.512783309817314, + "objective/scores": 0.1, + "policy/approxkl_avg": 86.36170196533203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5656028389930725, + "step": 1450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996934413909912 + }, + { + "episode": 23232, + "epoch": 0.4175863680483158, + "loss/policy_avg": 0.104374460875988, + "lr": 2.721817484662577e-06, + "objective/entropy": 19.990764617919922, + "objective/kl": 18.923419952392578, + "objective/non_score_reward": -1.8923419713974, + "objective/rlhf_reward": -9.569368362426758, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.039336204528809, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5476473569869995, + "step": 1451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9971189498901367 + }, + { + "episode": 23248, + "epoch": 0.4178739619657044, + "loss/policy_avg": 0.32264718413352966, + "lr": 2.7216257668711658e-06, + "objective/entropy": 41.33949279785156, + "objective/kl": 14.19123649597168, + "objective/non_score_reward": -1.4191235303878784, + "objective/rlhf_reward": -7.676494121551514, + "objective/scores": -0.5, + "policy/approxkl_avg": 64.27468872070312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4869697690010071, + "step": 1452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996262788772583 + }, + { + "episode": 23264, + "epoch": 0.41816155588309306, + "loss/policy_avg": 0.1430419385433197, + "lr": 2.721434049079755e-06, + "objective/entropy": 135.11158752441406, + "objective/kl": 16.908254623413086, + "objective/non_score_reward": -1.6908254623413086, + "objective/rlhf_reward": -6.363301908969879, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.750733375549316, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6986678242683411, + "step": 1453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003662109375 + }, + { + "episode": 23280, + "epoch": 0.4184491498004817, + "loss/policy_avg": 0.05973121523857117, + "lr": 2.721242331288344e-06, + "objective/entropy": 148.2733917236328, + "objective/kl": 18.87240219116211, + "objective/non_score_reward": -1.8872401714324951, + "objective/rlhf_reward": -4.625241969467375, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.233966827392578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.545336902141571, + "step": 1454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000401258468628 + }, + { + "episode": 23296, + "epoch": 0.41873674371787034, + "loss/policy_avg": -0.26827386021614075, + "lr": 2.7210506134969326e-06, + "objective/entropy": -35.343963623046875, + "objective/kl": 12.835941314697266, + "objective/non_score_reward": -1.2835942506790161, + "objective/rlhf_reward": -0.7343767940998074, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.88837432861328, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.719089150428772, + "step": 1455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0176620483398438 + }, + { + "episode": 23312, + "epoch": 0.41902433763525904, + "loss/policy_avg": 0.4007697105407715, + "lr": 2.7208588957055214e-06, + "objective/entropy": 38.718055725097656, + "objective/kl": 16.693111419677734, + "objective/non_score_reward": -1.669311285018921, + "objective/rlhf_reward": -3.75352620029566, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 90.98106384277344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7188234925270081, + "step": 1456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997847080230713 + }, + { + "episode": 23328, + "epoch": 0.4193119315526477, + "loss/policy_avg": 0.2741047441959381, + "lr": 2.7206671779141102e-06, + "objective/entropy": 131.7677459716797, + "objective/kl": 13.466203689575195, + "objective/non_score_reward": -1.3466205596923828, + "objective/rlhf_reward": -3.2637757084527355, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 6.06925630569458, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5739939212799072, + "step": 1457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990949630737305 + }, + { + "episode": 23344, + "epoch": 0.4195995254700363, + "loss/policy_avg": -0.20607559382915497, + "lr": 2.7204754601226995e-06, + "objective/entropy": 140.73605346679688, + "objective/kl": 12.64559555053711, + "objective/non_score_reward": -1.2645595073699951, + "objective/rlhf_reward": -3.4541179276147655, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 25.566354751586914, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7669584155082703, + "step": 1458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.010098934173584 + }, + { + "episode": 23360, + "epoch": 0.41988711938742496, + "loss/policy_avg": 0.08116651326417923, + "lr": 2.7202837423312883e-06, + "objective/entropy": -56.271209716796875, + "objective/kl": 11.172001838684082, + "objective/non_score_reward": -1.1172001361846924, + "objective/rlhf_reward": -4.068800783157348, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.867913246154785, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7661799192428589, + "step": 1459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.99923837184906 + }, + { + "episode": 23376, + "epoch": 0.4201747133048136, + "loss/policy_avg": 0.1444312185049057, + "lr": 2.720092024539877e-06, + "objective/entropy": 27.851055145263672, + "objective/kl": 14.869014739990234, + "objective/non_score_reward": -1.4869015216827393, + "objective/rlhf_reward": -1.5476059079170223, + "objective/scores": 1.1, + "policy/approxkl_avg": 118.09273529052734, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6213133335113525, + "step": 1460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0028345584869385 + }, + { + "episode": 23392, + "epoch": 0.42046230722220224, + "loss/policy_avg": 0.24942255020141602, + "lr": 2.7199003067484663e-06, + "objective/entropy": -101.20877838134766, + "objective/kl": 15.96616268157959, + "objective/non_score_reward": -1.5966161489486694, + "objective/rlhf_reward": -8.386465072631836, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.288822174072266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7658300399780273, + "step": 1461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989122152328491 + }, + { + "episode": 23408, + "epoch": 0.4207499011395909, + "loss/policy_avg": -0.02906278893351555, + "lr": 2.719708588957055e-06, + "objective/entropy": 89.44154357910156, + "objective/kl": 11.966876983642578, + "objective/non_score_reward": -1.1966878175735474, + "objective/rlhf_reward": -4.38675103187561, + "objective/scores": 0.1, + "policy/approxkl_avg": 33.13001251220703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5389198660850525, + "step": 1462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0009779930114746 + }, + { + "episode": 23424, + "epoch": 0.42103749505697957, + "loss/policy_avg": 0.4894261062145233, + "lr": 2.7195168711656444e-06, + "objective/entropy": -4.174308776855469, + "objective/kl": 11.774065971374512, + "objective/non_score_reward": -1.1774065494537354, + "objective/rlhf_reward": -0.3096263170242306, + "objective/scores": 1.1, + "policy/approxkl_avg": 32.79606628417969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5737270712852478, + "step": 1463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989784955978394 + }, + { + "episode": 23440, + "epoch": 0.4213250889743682, + "loss/policy_avg": 0.28955715894699097, + "lr": 2.719325153374233e-06, + "objective/entropy": -63.83994674682617, + "objective/kl": 5.191441059112549, + "objective/non_score_reward": -0.5191440582275391, + "objective/rlhf_reward": -1.676576367020607, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.2088193893432617, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4842286705970764, + "step": 1464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986271858215332 + }, + { + "episode": 23456, + "epoch": 0.42161268289175685, + "loss/policy_avg": 0.3400211036205292, + "lr": 2.719133435582822e-06, + "objective/entropy": -41.42976379394531, + "objective/kl": 14.391012191772461, + "objective/non_score_reward": -1.439101219177246, + "objective/rlhf_reward": -7.756404876708984, + "objective/scores": -0.5, + "policy/approxkl_avg": 39.38445281982422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5957543849945068, + "step": 1465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980661869049072 + }, + { + "episode": 23472, + "epoch": 0.4219002768091455, + "loss/policy_avg": 0.30436578392982483, + "lr": 2.718941717791411e-06, + "objective/entropy": 191.9390869140625, + "objective/kl": 12.309860229492188, + "objective/non_score_reward": -1.2309861183166504, + "objective/rlhf_reward": -6.923944473266602, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.813614845275879, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6054030656814575, + "step": 1466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998063325881958 + }, + { + "episode": 23488, + "epoch": 0.42218787072653413, + "loss/policy_avg": -0.024539276957511902, + "lr": 2.71875e-06, + "objective/entropy": 69.8360366821289, + "objective/kl": 10.700786590576172, + "objective/non_score_reward": -1.0700786113739014, + "objective/rlhf_reward": 0.11968519687652623, + "objective/scores": 1.1, + "policy/approxkl_avg": 58.47478103637695, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.46354907751083374, + "step": 1467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0008950233459473 + }, + { + "episode": 23504, + "epoch": 0.42247546464392277, + "loss/policy_avg": -0.04982258379459381, + "lr": 2.7185582822085892e-06, + "objective/entropy": 105.5312271118164, + "objective/kl": 11.204243659973145, + "objective/non_score_reward": -1.1204243898391724, + "objective/rlhf_reward": -4.081697633862495, + "objective/scores": 0.1, + "policy/approxkl_avg": 56.498252868652344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7991576790809631, + "step": 1468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999009370803833 + }, + { + "episode": 23520, + "epoch": 0.4227630585613114, + "loss/policy_avg": 0.1773151159286499, + "lr": 2.718366564417178e-06, + "objective/entropy": -217.1636199951172, + "objective/kl": 11.214568138122559, + "objective/non_score_reward": -1.1214568614959717, + "objective/rlhf_reward": -6.485827445983887, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.7668520212173462, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6018310189247131, + "step": 1469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0021896362304688 + }, + { + "episode": 23536, + "epoch": 0.42305065247870005, + "loss/policy_avg": 0.2942320704460144, + "lr": 2.718174846625767e-06, + "objective/entropy": 195.77435302734375, + "objective/kl": 12.295269966125488, + "objective/non_score_reward": -1.2295269966125488, + "objective/rlhf_reward": -4.518107733130455, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.731227874755859, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6320974826812744, + "step": 1470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986553192138672 + }, + { + "episode": 23552, + "epoch": 0.42333824639608875, + "loss/policy_avg": 0.5762624740600586, + "lr": 2.717983128834356e-06, + "objective/entropy": 11.472053527832031, + "objective/kl": 15.345174789428711, + "objective/non_score_reward": -1.5345174074172974, + "objective/rlhf_reward": -5.738069570064544, + "objective/scores": 0.1, + "policy/approxkl_avg": 76.61418151855469, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.8648232817649841, + "step": 1471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0015358924865723 + }, + { + "episode": 23568, + "epoch": 0.4236258403134774, + "loss/policy_avg": 0.24686360359191895, + "lr": 2.717791411042945e-06, + "objective/entropy": -29.269935607910156, + "objective/kl": 18.786380767822266, + "objective/non_score_reward": -1.8786380290985107, + "objective/rlhf_reward": -9.514552116394043, + "objective/scores": -0.5, + "policy/approxkl_avg": 123.64360046386719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8638693690299988, + "step": 1472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996600866317749 + }, + { + "episode": 23584, + "epoch": 0.423913434230866, + "loss/policy_avg": 0.6834498643875122, + "lr": 2.7175996932515337e-06, + "objective/entropy": 117.84262084960938, + "objective/kl": 9.692749977111816, + "objective/non_score_reward": -0.9692749977111816, + "objective/rlhf_reward": -5.877099990844727, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.469287872314453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4936543405056, + "step": 1473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992587566375732 + }, + { + "episode": 23600, + "epoch": 0.42420102814825467, + "loss/policy_avg": 0.2896210253238678, + "lr": 2.717407975460123e-06, + "objective/entropy": -165.11666870117188, + "objective/kl": 15.991458892822266, + "objective/non_score_reward": -1.5991458892822266, + "objective/rlhf_reward": -1.9965835869312283, + "objective/scores": 1.1, + "policy/approxkl_avg": 32.42799377441406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5854696035385132, + "step": 1474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000058650970459 + }, + { + "episode": 23616, + "epoch": 0.4244886220656433, + "loss/policy_avg": 0.5737141370773315, + "lr": 2.7172162576687118e-06, + "objective/entropy": 166.11766052246094, + "objective/kl": 23.459354400634766, + "objective/non_score_reward": -2.345935106277466, + "objective/rlhf_reward": -11.383740425109863, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.438879013061523, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6819442510604858, + "step": 1475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9964954853057861 + }, + { + "episode": 23632, + "epoch": 0.42477621598303195, + "loss/policy_avg": 0.355984628200531, + "lr": 2.717024539877301e-06, + "objective/entropy": 10.719039916992188, + "objective/kl": 13.877435684204102, + "objective/non_score_reward": -1.3877434730529785, + "objective/rlhf_reward": -1.1509741157293316, + "objective/scores": 1.1, + "policy/approxkl_avg": 46.079010009765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.697611391544342, + "step": 1476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987831115722656 + }, + { + "episode": 23648, + "epoch": 0.4250638099004206, + "loss/policy_avg": 0.3038805425167084, + "lr": 2.71683282208589e-06, + "objective/entropy": -127.39871978759766, + "objective/kl": 10.339272499084473, + "objective/non_score_reward": -1.0339272022247314, + "objective/rlhf_reward": -6.135708808898926, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.258731842041016, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7669675350189209, + "step": 1477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979398250579834 + }, + { + "episode": 23664, + "epoch": 0.4253514038178092, + "loss/policy_avg": 3.8968005180358887, + "lr": 2.7166411042944786e-06, + "objective/entropy": -48.2403564453125, + "objective/kl": 18.57742691040039, + "objective/non_score_reward": -1.8577427864074707, + "objective/rlhf_reward": -3.0309712052345272, + "objective/scores": 1.1, + "policy/approxkl_avg": 22.280670166015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7676411271095276, + "step": 1478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9987635612487793 + }, + { + "episode": 23680, + "epoch": 0.4256389977351979, + "loss/policy_avg": -0.27429449558258057, + "lr": 2.7164493865030674e-06, + "objective/entropy": -148.14480590820312, + "objective/kl": 12.99055290222168, + "objective/non_score_reward": -1.2990553379058838, + "objective/rlhf_reward": -4.796221590042114, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.844132423400879, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.795985758304596, + "step": 1479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00437593460083 + }, + { + "episode": 23696, + "epoch": 0.42592659165258656, + "loss/policy_avg": 0.10123275220394135, + "lr": 2.7162576687116562e-06, + "objective/entropy": -3.9328994750976562, + "objective/kl": 10.527267456054688, + "objective/non_score_reward": -1.0527267456054688, + "objective/rlhf_reward": -6.210907459259033, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.531219482421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6534985303878784, + "step": 1480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000793933868408 + }, + { + "episode": 23712, + "epoch": 0.4262141855699752, + "loss/policy_avg": 0.8577756881713867, + "lr": 2.7160659509202455e-06, + "objective/entropy": -65.08586883544922, + "objective/kl": 21.022594451904297, + "objective/non_score_reward": -2.102259635925293, + "objective/rlhf_reward": -10.409038543701172, + "objective/scores": -0.5, + "policy/approxkl_avg": 177.66314697265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7775266170501709, + "step": 1481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983553886413574 + }, + { + "episode": 23728, + "epoch": 0.42650177948736384, + "loss/policy_avg": 0.17644314467906952, + "lr": 2.7158742331288343e-06, + "objective/entropy": -16.106605529785156, + "objective/kl": 14.382020950317383, + "objective/non_score_reward": -1.438201904296875, + "objective/rlhf_reward": -4.019474522272746, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 14.692214012145996, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6867858171463013, + "step": 1482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983866214752197 + }, + { + "episode": 23744, + "epoch": 0.4267893734047525, + "loss/policy_avg": -0.17333167791366577, + "lr": 2.715682515337423e-06, + "objective/entropy": -296.7401123046875, + "objective/kl": 10.11327075958252, + "objective/non_score_reward": -1.0113270282745361, + "objective/rlhf_reward": -1.1215893223297326, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 16.041187286376953, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7763004899024963, + "step": 1483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0011274814605713 + }, + { + "episode": 23760, + "epoch": 0.4270769673221411, + "loss/policy_avg": 0.5083790421485901, + "lr": 2.7154907975460123e-06, + "objective/entropy": 127.42313385009766, + "objective/kl": 15.439505577087402, + "objective/non_score_reward": -1.5439507961273193, + "objective/rlhf_reward": -1.7758030056953427, + "objective/scores": 1.1, + "policy/approxkl_avg": 22.564075469970703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6805832386016846, + "step": 1484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985697269439697 + }, + { + "episode": 23776, + "epoch": 0.42736456123952976, + "loss/policy_avg": 0.6594638228416443, + "lr": 2.715299079754601e-06, + "objective/entropy": 0.7167205810546875, + "objective/kl": 13.968058586120605, + "objective/non_score_reward": -1.396805763244629, + "objective/rlhf_reward": -3.8538898686567937, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 29.08931541442871, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6674610376358032, + "step": 1485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992890357971191 + }, + { + "episode": 23792, + "epoch": 0.42765215515691846, + "loss/policy_avg": 0.20646658539772034, + "lr": 2.7151073619631904e-06, + "objective/entropy": -69.60407257080078, + "objective/kl": 10.60425090789795, + "objective/non_score_reward": -1.0604252815246582, + "objective/rlhf_reward": -3.8417008280754086, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.102411270141602, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6721906661987305, + "step": 1486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982298612594604 + }, + { + "episode": 23808, + "epoch": 0.4279397490743071, + "loss/policy_avg": 0.15155388414859772, + "lr": 2.714915644171779e-06, + "objective/entropy": 194.2589874267578, + "objective/kl": 16.754682540893555, + "objective/non_score_reward": -1.6754682064056396, + "objective/rlhf_reward": -6.301872944831848, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.726747512817383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6672933101654053, + "step": 1487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999563694000244 + }, + { + "episode": 23824, + "epoch": 0.42822734299169574, + "loss/policy_avg": 0.7662161588668823, + "lr": 2.714723926380368e-06, + "objective/entropy": -74.04181671142578, + "objective/kl": 18.99214744567871, + "objective/non_score_reward": -1.899214744567871, + "objective/rlhf_reward": -9.596858978271484, + "objective/scores": -0.5, + "policy/approxkl_avg": 98.49945831298828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6449931263923645, + "step": 1488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000901937484741 + }, + { + "episode": 23840, + "epoch": 0.4285149369090844, + "loss/policy_avg": 0.04193422198295593, + "lr": 2.7145322085889572e-06, + "objective/entropy": 109.29447937011719, + "objective/kl": 14.153707504272461, + "objective/non_score_reward": -1.4153707027435303, + "objective/rlhf_reward": -5.2614830493927, + "objective/scores": 0.1, + "policy/approxkl_avg": 0.5321764945983887, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7183086276054382, + "step": 1489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0033793449401855 + }, + { + "episode": 23856, + "epoch": 0.428802530826473, + "loss/policy_avg": 0.1599797010421753, + "lr": 2.714340490797546e-06, + "objective/entropy": 304.7250671386719, + "objective/kl": 16.353168487548828, + "objective/non_score_reward": -1.6353168487548828, + "objective/rlhf_reward": -4.593856344895299, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 11.707456588745117, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 1.0101954936981201, + "step": 1490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995477199554443 + }, + { + "episode": 23872, + "epoch": 0.42909012474386166, + "loss/policy_avg": 0.4436057209968567, + "lr": 2.7141487730061353e-06, + "objective/entropy": 181.70814514160156, + "objective/kl": 10.047649383544922, + "objective/non_score_reward": -1.0047650337219238, + "objective/rlhf_reward": 0.3809399843215946, + "objective/scores": 1.1, + "policy/approxkl_avg": 38.075225830078125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6739720106124878, + "step": 1491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002894401550293 + }, + { + "episode": 23888, + "epoch": 0.4293777186612503, + "loss/policy_avg": 0.15693873167037964, + "lr": 2.713957055214724e-06, + "objective/entropy": 226.33489990234375, + "objective/kl": 11.626256942749023, + "objective/non_score_reward": -1.162625789642334, + "objective/rlhf_reward": -6.650503158569336, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.06602096557617, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5729613900184631, + "step": 1492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001974105834961 + }, + { + "episode": 23904, + "epoch": 0.42966531257863894, + "loss/policy_avg": 3.0012454986572266, + "lr": 2.713765337423313e-06, + "objective/entropy": 87.70625305175781, + "objective/kl": 11.87493896484375, + "objective/non_score_reward": -1.1874938011169434, + "objective/rlhf_reward": -6.749975204467773, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.785877227783203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7042635679244995, + "step": 1493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0035343170166016 + }, + { + "episode": 23920, + "epoch": 0.42995290649602763, + "loss/policy_avg": 0.29552215337753296, + "lr": 2.713573619631902e-06, + "objective/entropy": 100.45034790039062, + "objective/kl": 15.548515319824219, + "objective/non_score_reward": -1.5548515319824219, + "objective/rlhf_reward": -5.819406008720398, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.74082565307617, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7816206812858582, + "step": 1494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000070810317993 + }, + { + "episode": 23936, + "epoch": 0.43024050041341627, + "loss/policy_avg": 0.138979971408844, + "lr": 2.713381901840491e-06, + "objective/entropy": 2.042755126953125, + "objective/kl": 16.496444702148438, + "objective/non_score_reward": -1.6496446132659912, + "objective/rlhf_reward": -8.598578453063965, + "objective/scores": -0.5, + "policy/approxkl_avg": 104.73353576660156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.853604793548584, + "step": 1495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9992599487304688 + }, + { + "episode": 23952, + "epoch": 0.4305280943308049, + "loss/policy_avg": -0.35569775104522705, + "lr": 2.7131901840490797e-06, + "objective/entropy": 47.663047790527344, + "objective/kl": 15.499086380004883, + "objective/non_score_reward": -1.5499086380004883, + "objective/rlhf_reward": -8.199634552001953, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.7884521484375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5710984468460083, + "step": 1496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003573417663574 + }, + { + "episode": 23968, + "epoch": 0.43081568824819355, + "loss/policy_avg": 0.4634855389595032, + "lr": 2.712998466257669e-06, + "objective/entropy": 186.1064453125, + "objective/kl": 13.683874130249023, + "objective/non_score_reward": -1.3683874607086182, + "objective/rlhf_reward": -5.073549798130989, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.9293155670166, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6978639364242554, + "step": 1497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984076023101807 + }, + { + "episode": 23984, + "epoch": 0.4311032821655822, + "loss/policy_avg": 0.36279696226119995, + "lr": 2.7128067484662578e-06, + "objective/entropy": -4.6014556884765625, + "objective/kl": 12.281639099121094, + "objective/non_score_reward": -1.2281639575958252, + "objective/rlhf_reward": -0.5126561433076855, + "objective/scores": 1.1, + "policy/approxkl_avg": 24.407772064208984, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5443528294563293, + "step": 1498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997795820236206 + }, + { + "episode": 24000, + "epoch": 0.43139087608297083, + "loss/policy_avg": 0.6602043509483337, + "lr": 2.712615030674847e-06, + "objective/entropy": -24.860595703125, + "objective/kl": 12.323899269104004, + "objective/non_score_reward": -1.2323899269104004, + "objective/rlhf_reward": -6.929559707641602, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.667274475097656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7064457535743713, + "step": 1499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.004753589630127 + }, + { + "episode": 24016, + "epoch": 0.43167847000035947, + "loss/policy_avg": 0.25143861770629883, + "lr": 2.712423312883436e-06, + "objective/entropy": 293.12713623046875, + "objective/kl": 12.063253402709961, + "objective/non_score_reward": -1.2063254117965698, + "objective/rlhf_reward": -4.425301736593246, + "objective/scores": 0.1, + "policy/approxkl_avg": 60.569305419921875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7328671813011169, + "step": 1500, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970157146453857 + }, + { + "episode": 24032, + "epoch": 0.43196606391774817, + "loss/policy_avg": 0.2839478850364685, + "lr": 2.7122315950920246e-06, + "objective/entropy": 110.01084899902344, + "objective/kl": 13.495948791503906, + "objective/non_score_reward": -1.3495948314666748, + "objective/rlhf_reward": -0.9983793929219242, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.78272819519043, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7883875370025635, + "step": 1501, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0003154277801514 + }, + { + "episode": 24048, + "epoch": 0.4322536578351368, + "loss/policy_avg": 0.20969723165035248, + "lr": 2.7120398773006134e-06, + "objective/entropy": 4.47552490234375, + "objective/kl": 11.441368103027344, + "objective/non_score_reward": -1.144136905670166, + "objective/rlhf_reward": -4.176547622680664, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.5608024597168, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5309766530990601, + "step": 1502, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012364387512207 + }, + { + "episode": 24064, + "epoch": 0.43254125175252545, + "loss/policy_avg": 0.19796213507652283, + "lr": 2.7118481595092022e-06, + "objective/entropy": 78.621337890625, + "objective/kl": 17.16468048095703, + "objective/non_score_reward": -1.716468095779419, + "objective/rlhf_reward": -8.865872383117676, + "objective/scores": -0.5, + "policy/approxkl_avg": 159.43646240234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6347424387931824, + "step": 1503, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9969336986541748 + }, + { + "episode": 24080, + "epoch": 0.4328288456699141, + "loss/policy_avg": 0.8738093376159668, + "lr": 2.7116564417177915e-06, + "objective/entropy": 82.19612884521484, + "objective/kl": 28.559629440307617, + "objective/non_score_reward": -2.8559629917144775, + "objective/rlhf_reward": -11.02385220527649, + "objective/scores": 0.1, + "policy/approxkl_avg": 123.68681335449219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8077266216278076, + "step": 1504, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979246854782104 + }, + { + "episode": 24096, + "epoch": 0.4331164395873027, + "loss/policy_avg": -0.05104995146393776, + "lr": 2.7114647239263803e-06, + "objective/entropy": -74.74847412109375, + "objective/kl": 7.449967384338379, + "objective/non_score_reward": -0.7449966669082642, + "objective/rlhf_reward": -2.5799867570400234, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.6437416076660156, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5547415614128113, + "step": 1505, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997807741165161 + }, + { + "episode": 24112, + "epoch": 0.43340403350469137, + "loss/policy_avg": 0.2574435770511627, + "lr": 2.7112730061349695e-06, + "objective/entropy": -191.8342742919922, + "objective/kl": 11.918403625488281, + "objective/non_score_reward": -1.191840410232544, + "objective/rlhf_reward": -4.367361432313919, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.6358528137207, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5687357187271118, + "step": 1506, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998598098754883 + }, + { + "episode": 24128, + "epoch": 0.43369162742208, + "loss/policy_avg": -0.061624474823474884, + "lr": 2.7110812883435583e-06, + "objective/entropy": 209.62362670898438, + "objective/kl": 12.062373161315918, + "objective/non_score_reward": -1.2062373161315918, + "objective/rlhf_reward": -6.824949264526367, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.045953750610352, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5013295412063599, + "step": 1507, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000643730163574 + }, + { + "episode": 24144, + "epoch": 0.43397922133946865, + "loss/policy_avg": -0.24623996019363403, + "lr": 2.710889570552147e-06, + "objective/entropy": -16.262832641601562, + "objective/kl": 18.696651458740234, + "objective/non_score_reward": -1.8696651458740234, + "objective/rlhf_reward": -7.078660643100738, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.216144561767578, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7861824631690979, + "step": 1508, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002674102783203 + }, + { + "episode": 24160, + "epoch": 0.43426681525685734, + "loss/policy_avg": 0.1839936077594757, + "lr": 2.7106978527607364e-06, + "objective/entropy": -86.09889221191406, + "objective/kl": 16.891660690307617, + "objective/non_score_reward": -1.6891660690307617, + "objective/rlhf_reward": -8.756664276123047, + "objective/scores": -0.5, + "policy/approxkl_avg": 95.92230224609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.591369092464447, + "step": 1509, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9982521533966064 + }, + { + "episode": 24176, + "epoch": 0.434554409174246, + "loss/policy_avg": 0.09178348630666733, + "lr": 2.710506134969325e-06, + "objective/entropy": 319.3994445800781, + "objective/kl": 13.149795532226562, + "objective/non_score_reward": -1.3149795532226562, + "objective/rlhf_reward": -4.859918212890625, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.748076438903809, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8005496263504028, + "step": 1510, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979579448699951 + }, + { + "episode": 24192, + "epoch": 0.4348420030916346, + "loss/policy_avg": -0.19410985708236694, + "lr": 2.710314417177914e-06, + "objective/entropy": -26.6497802734375, + "objective/kl": 13.471776962280273, + "objective/non_score_reward": -1.3471777439117432, + "objective/rlhf_reward": -2.4649919017564983, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 31.211530685424805, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6536291241645813, + "step": 1511, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0035223960876465 + }, + { + "episode": 24208, + "epoch": 0.43512959700902326, + "loss/policy_avg": 0.14305329322814941, + "lr": 2.7101226993865032e-06, + "objective/entropy": 41.977569580078125, + "objective/kl": 10.400999069213867, + "objective/non_score_reward": -1.040099859237671, + "objective/rlhf_reward": -3.760399496555328, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.730424880981445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6075326204299927, + "step": 1512, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0010409355163574 + }, + { + "episode": 24224, + "epoch": 0.4354171909264119, + "loss/policy_avg": 0.19263537228107452, + "lr": 2.709930981595092e-06, + "objective/entropy": 149.12783813476562, + "objective/kl": 14.2966947555542, + "objective/non_score_reward": -1.4296694993972778, + "objective/rlhf_reward": -1.3186778783798214, + "objective/scores": 1.1, + "policy/approxkl_avg": 33.09269714355469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.35392051935195923, + "step": 1513, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018491744995117 + }, + { + "episode": 24240, + "epoch": 0.43570478484380054, + "loss/policy_avg": -0.43073225021362305, + "lr": 2.7097392638036813e-06, + "objective/entropy": -185.1280517578125, + "objective/kl": 14.71350383758545, + "objective/non_score_reward": -1.4713503122329712, + "objective/rlhf_reward": -3.937990019993718, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 21.07646369934082, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5609403848648071, + "step": 1514, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.015136241912842 + }, + { + "episode": 24256, + "epoch": 0.4359923787611892, + "loss/policy_avg": 0.4406461715698242, + "lr": 2.70954754601227e-06, + "objective/entropy": 164.08778381347656, + "objective/kl": 20.165145874023438, + "objective/non_score_reward": -2.016514539718628, + "objective/rlhf_reward": -10.066058158874512, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.16340255737305, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6640625, + "step": 1515, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9969031810760498 + }, + { + "episode": 24272, + "epoch": 0.4362799726785778, + "loss/policy_avg": 0.09469935297966003, + "lr": 2.709355828220859e-06, + "objective/entropy": -48.509239196777344, + "objective/kl": 20.4273738861084, + "objective/non_score_reward": -2.0427374839782715, + "objective/rlhf_reward": -7.7709495484828945, + "objective/scores": 0.1, + "policy/approxkl_avg": 220.02273559570312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6581880450248718, + "step": 1516, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996927261352539 + }, + { + "episode": 24288, + "epoch": 0.4365675665959665, + "loss/policy_avg": 1.0518330335617065, + "lr": 2.709164110429448e-06, + "objective/entropy": -73.3015365600586, + "objective/kl": 13.697656631469727, + "objective/non_score_reward": -1.3697656393051147, + "objective/rlhf_reward": -5.079062661528587, + "objective/scores": 0.1, + "policy/approxkl_avg": 94.2430191040039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7663182020187378, + "step": 1517, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0014703273773193 + }, + { + "episode": 24304, + "epoch": 0.43685516051335516, + "loss/policy_avg": 0.2166292667388916, + "lr": 2.708972392638037e-06, + "objective/entropy": 80.52146911621094, + "objective/kl": 9.666765213012695, + "objective/non_score_reward": -0.9666764736175537, + "objective/rlhf_reward": -3.466705864667892, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.77708625793457, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8598181009292603, + "step": 1518, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006589889526367 + }, + { + "episode": 24320, + "epoch": 0.4371427544307438, + "loss/policy_avg": 1.0463180541992188, + "lr": 2.708780674846626e-06, + "objective/entropy": 104.51371765136719, + "objective/kl": 10.208975791931152, + "objective/non_score_reward": -1.020897626876831, + "objective/rlhf_reward": 0.31640946269035375, + "objective/scores": 1.1, + "policy/approxkl_avg": 71.09988403320312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6160992383956909, + "step": 1519, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9966857433319092 + }, + { + "episode": 24336, + "epoch": 0.43743034834813244, + "loss/policy_avg": 0.49003443121910095, + "lr": 2.708588957055215e-06, + "objective/entropy": 98.09468078613281, + "objective/kl": 16.203685760498047, + "objective/non_score_reward": -1.6203685998916626, + "objective/rlhf_reward": -4.925215094295099, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 63.16912841796875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7153348922729492, + "step": 1520, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998132586479187 + }, + { + "episode": 24352, + "epoch": 0.4377179422655211, + "loss/policy_avg": 0.07383514940738678, + "lr": 2.7083972392638038e-06, + "objective/entropy": 91.20425415039062, + "objective/kl": 10.100933074951172, + "objective/non_score_reward": -1.0100932121276855, + "objective/rlhf_reward": -2.43625302976759, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 32.343509674072266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5615959167480469, + "step": 1521, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986927509307861 + }, + { + "episode": 24368, + "epoch": 0.4380055361829097, + "loss/policy_avg": 0.615394115447998, + "lr": 2.708205521472393e-06, + "objective/entropy": 34.11743927001953, + "objective/kl": 13.834301948547363, + "objective/non_score_reward": -1.3834302425384521, + "objective/rlhf_reward": -7.533720970153809, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.0637321472168, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7028712630271912, + "step": 1522, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989628791809082 + }, + { + "episode": 24384, + "epoch": 0.43829313010029836, + "loss/policy_avg": 0.42487022280693054, + "lr": 2.7080138036809814e-06, + "objective/entropy": 291.3263854980469, + "objective/kl": 16.128992080688477, + "objective/non_score_reward": -1.6128990650177002, + "objective/rlhf_reward": -2.0515966624021527, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.953594207763672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7052067518234253, + "step": 1523, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995543956756592 + }, + { + "episode": 24400, + "epoch": 0.43858072401768705, + "loss/policy_avg": 0.9943918585777283, + "lr": 2.7078220858895706e-06, + "objective/entropy": 183.32113647460938, + "objective/kl": 15.212042808532715, + "objective/non_score_reward": -1.5212042331695557, + "objective/rlhf_reward": -1.6848166942596432, + "objective/scores": 1.1, + "policy/approxkl_avg": 233.8773193359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7408942580223083, + "step": 1524, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999128818511963 + }, + { + "episode": 24416, + "epoch": 0.4388683179350757, + "loss/policy_avg": 0.9949908256530762, + "lr": 2.7076303680981594e-06, + "objective/entropy": -187.033203125, + "objective/kl": 17.688919067382812, + "objective/non_score_reward": -1.7688918113708496, + "objective/rlhf_reward": -6.675567245483398, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.05734634399414, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.8625385761260986, + "step": 1525, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9983234405517578 + }, + { + "episode": 24432, + "epoch": 0.43915591185246433, + "loss/policy_avg": 0.8484746813774109, + "lr": 2.7074386503067482e-06, + "objective/entropy": 174.75379943847656, + "objective/kl": 20.135005950927734, + "objective/non_score_reward": -2.013500690460205, + "objective/rlhf_reward": -3.654003000259399, + "objective/scores": 1.1, + "policy/approxkl_avg": 134.73599243164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.664713442325592, + "step": 1526, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998801946640015 + }, + { + "episode": 24448, + "epoch": 0.43944350576985297, + "loss/policy_avg": 1.8293222188949585, + "lr": 2.7072469325153375e-06, + "objective/entropy": 57.493255615234375, + "objective/kl": 8.005315780639648, + "objective/non_score_reward": -0.8005315661430359, + "objective/rlhf_reward": 1.1978736609220508, + "objective/scores": 1.1, + "policy/approxkl_avg": 17.183082580566406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7343942523002625, + "step": 1527, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00285005569458 + }, + { + "episode": 24464, + "epoch": 0.4397310996872416, + "loss/policy_avg": 0.6055887937545776, + "lr": 2.7070552147239263e-06, + "objective/entropy": 205.1093292236328, + "objective/kl": 15.435603141784668, + "objective/non_score_reward": -1.5435603857040405, + "objective/rlhf_reward": -1.7742413640022274, + "objective/scores": 1.1, + "policy/approxkl_avg": 60.15907287597656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5648759603500366, + "step": 1528, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996998310089111 + }, + { + "episode": 24480, + "epoch": 0.44001869360463025, + "loss/policy_avg": 1.3451311588287354, + "lr": 2.7068634969325155e-06, + "objective/entropy": 190.40625, + "objective/kl": 13.39560317993164, + "objective/non_score_reward": -1.3395602703094482, + "objective/rlhf_reward": -0.9582409471273419, + "objective/scores": 1.1, + "policy/approxkl_avg": 67.11732482910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8093500137329102, + "step": 1529, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993016719818115 + }, + { + "episode": 24496, + "epoch": 0.4403062875220189, + "loss/policy_avg": 0.3937966227531433, + "lr": 2.7066717791411043e-06, + "objective/entropy": 48.01210021972656, + "objective/kl": 15.38280200958252, + "objective/non_score_reward": -1.5382802486419678, + "objective/rlhf_reward": -8.153120994567871, + "objective/scores": -0.5, + "policy/approxkl_avg": 135.72549438476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7022747993469238, + "step": 1530, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.995867133140564 + }, + { + "episode": 24512, + "epoch": 0.44059388143940753, + "loss/policy_avg": 0.4086163341999054, + "lr": 2.706480061349693e-06, + "objective/entropy": 29.64532470703125, + "objective/kl": 11.605493545532227, + "objective/non_score_reward": -1.160549283027649, + "objective/rlhf_reward": -0.2421972215175625, + "objective/scores": 1.1, + "policy/approxkl_avg": 110.24431610107422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48122888803482056, + "step": 1531, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973434209823608 + }, + { + "episode": 24528, + "epoch": 0.4408814753567962, + "loss/policy_avg": 0.16401143372058868, + "lr": 2.7062883435582824e-06, + "objective/entropy": -106.093505859375, + "objective/kl": 18.429615020751953, + "objective/non_score_reward": -1.8429614305496216, + "objective/rlhf_reward": -2.9718456029891964, + "objective/scores": 1.1, + "policy/approxkl_avg": 90.48030090332031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8432134389877319, + "step": 1532, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999216914176941 + }, + { + "episode": 24544, + "epoch": 0.44116906927418487, + "loss/policy_avg": 0.30233433842658997, + "lr": 2.706096625766871e-06, + "objective/entropy": -94.45288848876953, + "objective/kl": 10.195063591003418, + "objective/non_score_reward": -1.0195064544677734, + "objective/rlhf_reward": -6.078025817871094, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.0919206142425537, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6552790403366089, + "step": 1533, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9992151260375977 + }, + { + "episode": 24560, + "epoch": 0.4414566631915735, + "loss/policy_avg": 0.30027535557746887, + "lr": 2.70590490797546e-06, + "objective/entropy": 39.50469207763672, + "objective/kl": 21.69940185546875, + "objective/non_score_reward": -2.169940233230591, + "objective/rlhf_reward": -8.279760932922363, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.9611849784851074, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5584852695465088, + "step": 1534, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0010085105895996 + }, + { + "episode": 24576, + "epoch": 0.44174425710896215, + "loss/policy_avg": 0.1084059327840805, + "lr": 2.7057131901840492e-06, + "objective/entropy": 315.66522216796875, + "objective/kl": 11.735557556152344, + "objective/non_score_reward": -1.1735557317733765, + "objective/rlhf_reward": -3.032363360345946, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 4.585368633270264, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 1.023706078529358, + "step": 1535, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004658699035645 + }, + { + "episode": 24592, + "epoch": 0.4420318510263508, + "loss/policy_avg": 0.8311107158660889, + "lr": 2.705521472392638e-06, + "objective/entropy": 112.27359008789062, + "objective/kl": 16.359058380126953, + "objective/non_score_reward": -1.6359058618545532, + "objective/rlhf_reward": -2.1436234623193737, + "objective/scores": 1.1, + "policy/approxkl_avg": 98.65286254882812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8164811134338379, + "step": 1536, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976773262023926 + }, + { + "episode": 24608, + "epoch": 0.4423194449437394, + "loss/policy_avg": 0.3270980715751648, + "lr": 2.7053297546012273e-06, + "objective/entropy": 95.81588745117188, + "objective/kl": 16.39149284362793, + "objective/non_score_reward": -1.6391494274139404, + "objective/rlhf_reward": -8.556597709655762, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.173675537109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8174101114273071, + "step": 1537, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993128776550293 + }, + { + "episode": 24624, + "epoch": 0.44260703886112807, + "loss/policy_avg": 0.24724414944648743, + "lr": 2.705138036809816e-06, + "objective/entropy": 284.1396484375, + "objective/kl": 15.634368896484375, + "objective/non_score_reward": -1.5634369850158691, + "objective/rlhf_reward": -8.253747940063477, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.1526098251342773, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7458974123001099, + "step": 1538, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001605272293091 + }, + { + "episode": 24640, + "epoch": 0.44289463277851676, + "loss/policy_avg": 0.14971446990966797, + "lr": 2.704946319018405e-06, + "objective/entropy": -12.085342407226562, + "objective/kl": 18.393085479736328, + "objective/non_score_reward": -1.8393086194992065, + "objective/rlhf_reward": -5.409823308663304, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 53.39990234375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8056477308273315, + "step": 1539, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987678527832031 + }, + { + "episode": 24656, + "epoch": 0.4431822266959054, + "loss/policy_avg": 0.3267354965209961, + "lr": 2.704754601226994e-06, + "objective/entropy": 166.03140258789062, + "objective/kl": 17.804903030395508, + "objective/non_score_reward": -1.7804901599884033, + "objective/rlhf_reward": -6.721960878372192, + "objective/scores": 0.1, + "policy/approxkl_avg": 61.347267150878906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.827207624912262, + "step": 1540, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992976188659668 + }, + { + "episode": 24672, + "epoch": 0.44346982061329404, + "loss/policy_avg": -0.25124239921569824, + "lr": 2.704562883435583e-06, + "objective/entropy": 106.05780029296875, + "objective/kl": 18.72496795654297, + "objective/non_score_reward": -1.8724967241287231, + "objective/rlhf_reward": -7.089986896514892, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.37659454345703, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5372323989868164, + "step": 1541, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000049352645874 + }, + { + "episode": 24688, + "epoch": 0.4437574145306827, + "loss/policy_avg": 0.31747931241989136, + "lr": 2.704371165644172e-06, + "objective/entropy": -78.14122772216797, + "objective/kl": 10.030508041381836, + "objective/non_score_reward": -1.0030508041381836, + "objective/rlhf_reward": -3.612203514575958, + "objective/scores": 0.1, + "policy/approxkl_avg": 8.71822738647461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5570470690727234, + "step": 1542, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980380535125732 + }, + { + "episode": 24704, + "epoch": 0.4440450084480713, + "loss/policy_avg": -0.013954512774944305, + "lr": 2.704179447852761e-06, + "objective/entropy": 226.41941833496094, + "objective/kl": 15.391945838928223, + "objective/non_score_reward": -1.5391945838928223, + "objective/rlhf_reward": -3.2330593212854595, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 33.7995491027832, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6941033005714417, + "step": 1543, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005383491516113 + }, + { + "episode": 24720, + "epoch": 0.44433260236545996, + "loss/policy_avg": -0.052130766212940216, + "lr": 2.7039877300613498e-06, + "objective/entropy": -49.44288635253906, + "objective/kl": 14.369674682617188, + "objective/non_score_reward": -1.436967372894287, + "objective/rlhf_reward": -5.347869789600372, + "objective/scores": 0.1, + "policy/approxkl_avg": 12.564598083496094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6965159177780151, + "step": 1544, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005674362182617 + }, + { + "episode": 24736, + "epoch": 0.4446201962828486, + "loss/policy_avg": 0.21237152814865112, + "lr": 2.7037960122699386e-06, + "objective/entropy": 195.41824340820312, + "objective/kl": 13.13949203491211, + "objective/non_score_reward": -1.3139491081237793, + "objective/rlhf_reward": -7.255796432495117, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.779157638549805, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8595893383026123, + "step": 1545, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007612705230713 + }, + { + "episode": 24752, + "epoch": 0.44490779020023724, + "loss/policy_avg": 0.37602826952934265, + "lr": 2.7036042944785274e-06, + "objective/entropy": 217.9283905029297, + "objective/kl": 16.096073150634766, + "objective/non_score_reward": -1.6096073389053345, + "objective/rlhf_reward": -8.43842887878418, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.7367372512817383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7021983861923218, + "step": 1546, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986357688903809 + }, + { + "episode": 24768, + "epoch": 0.44519538411762594, + "loss/policy_avg": 0.2176741063594818, + "lr": 2.7034125766871166e-06, + "objective/entropy": 146.32022094726562, + "objective/kl": 19.497966766357422, + "objective/non_score_reward": -1.9497966766357422, + "objective/rlhf_reward": -7.399186706542968, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.904340744018555, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6778392791748047, + "step": 1547, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992797374725342 + }, + { + "episode": 24784, + "epoch": 0.4454829780350146, + "loss/policy_avg": -0.03819188103079796, + "lr": 2.7032208588957054e-06, + "objective/entropy": 22.281753540039062, + "objective/kl": 13.729183197021484, + "objective/non_score_reward": -1.3729183673858643, + "objective/rlhf_reward": -1.091673335433006, + "objective/scores": 1.1, + "policy/approxkl_avg": 45.58137512207031, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6482210159301758, + "step": 1548, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000016689300537 + }, + { + "episode": 24800, + "epoch": 0.4457705719524032, + "loss/policy_avg": -0.35607993602752686, + "lr": 2.7030291411042943e-06, + "objective/entropy": 161.14715576171875, + "objective/kl": 18.698495864868164, + "objective/non_score_reward": -1.869849443435669, + "objective/rlhf_reward": -7.0793980121612545, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.378082275390625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9166164398193359, + "step": 1549, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974842071533203 + }, + { + "episode": 24816, + "epoch": 0.44605816586979186, + "loss/policy_avg": 0.16531553864479065, + "lr": 2.7028374233128835e-06, + "objective/entropy": 16.770519256591797, + "objective/kl": 21.444007873535156, + "objective/non_score_reward": -2.1444008350372314, + "objective/rlhf_reward": -8.177603578567505, + "objective/scores": 0.1, + "policy/approxkl_avg": 61.63994598388672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.556962788105011, + "step": 1550, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987709522247314 + }, + { + "episode": 24832, + "epoch": 0.4463457597871805, + "loss/policy_avg": -0.10354608297348022, + "lr": 2.7026457055214723e-06, + "objective/entropy": 42.6910514831543, + "objective/kl": 17.690929412841797, + "objective/non_score_reward": -1.7690927982330322, + "objective/rlhf_reward": -6.676370894908905, + "objective/scores": 0.1, + "policy/approxkl_avg": 47.56449890136719, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6999937295913696, + "step": 1551, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000419855117798 + }, + { + "episode": 24848, + "epoch": 0.44663335370456914, + "loss/policy_avg": 0.22843822836875916, + "lr": 2.7024539877300615e-06, + "objective/entropy": 223.37420654296875, + "objective/kl": 16.745403289794922, + "objective/non_score_reward": -1.6745402812957764, + "objective/rlhf_reward": -6.298160946369171, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.113785743713379, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.688583493232727, + "step": 1552, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004663467407227 + }, + { + "episode": 24864, + "epoch": 0.4469209476219578, + "loss/policy_avg": 0.5825582146644592, + "lr": 2.7022622699386503e-06, + "objective/entropy": -226.0932159423828, + "objective/kl": 10.318975448608398, + "objective/non_score_reward": -1.031897783279419, + "objective/rlhf_reward": -6.127591133117676, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.590059280395508, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6814849376678467, + "step": 1553, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0015769004821777 + }, + { + "episode": 24880, + "epoch": 0.4472085415393464, + "loss/policy_avg": 0.7355405688285828, + "lr": 2.702070552147239e-06, + "objective/entropy": -0.4903068542480469, + "objective/kl": 11.162042617797852, + "objective/non_score_reward": -1.1162042617797852, + "objective/rlhf_reward": -1.5410977348100867, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 43.610984802246094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.48920321464538574, + "step": 1554, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000331163406372 + }, + { + "episode": 24896, + "epoch": 0.4474961354567351, + "loss/policy_avg": 0.6175955533981323, + "lr": 2.7018788343558284e-06, + "objective/entropy": 27.28057861328125, + "objective/kl": 14.743072509765625, + "objective/non_score_reward": -1.4743072986602783, + "objective/rlhf_reward": -7.897229194641113, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.485166072845459, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6037850379943848, + "step": 1555, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994779825210571 + }, + { + "episode": 24912, + "epoch": 0.44778372937412375, + "loss/policy_avg": 0.4660176634788513, + "lr": 2.701687116564417e-06, + "objective/entropy": 81.15247344970703, + "objective/kl": 15.817729949951172, + "objective/non_score_reward": -1.581773042678833, + "objective/rlhf_reward": -3.9270920217037197, + "objective/scores": 0.6, + "policy/approxkl_avg": 70.5673599243164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6252831220626831, + "step": 1556, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998610258102417 + }, + { + "episode": 24928, + "epoch": 0.4480713232915124, + "loss/policy_avg": 0.07333136349916458, + "lr": 2.7014953987730064e-06, + "objective/entropy": 144.30224609375, + "objective/kl": 22.033649444580078, + "objective/non_score_reward": -2.203364849090576, + "objective/rlhf_reward": -10.813459396362305, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.9249267578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8037264347076416, + "step": 1557, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996355772018433 + }, + { + "episode": 24944, + "epoch": 0.44835891720890103, + "loss/policy_avg": 0.256188303232193, + "lr": 2.7013036809815952e-06, + "objective/entropy": -67.07438659667969, + "objective/kl": 7.778665065765381, + "objective/non_score_reward": -0.7778664827346802, + "objective/rlhf_reward": -5.111466407775879, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.60340118408203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6475851535797119, + "step": 1558, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994537830352783 + }, + { + "episode": 24960, + "epoch": 0.44864651112628967, + "loss/policy_avg": 0.0924801379442215, + "lr": 2.701111963190184e-06, + "objective/entropy": 113.46358489990234, + "objective/kl": 14.019509315490723, + "objective/non_score_reward": -1.4019509553909302, + "objective/rlhf_reward": -7.6078033447265625, + "objective/scores": -0.5, + "policy/approxkl_avg": 25.744773864746094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6623624563217163, + "step": 1559, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986884593963623 + }, + { + "episode": 24976, + "epoch": 0.4489341050436783, + "loss/policy_avg": 0.1538938581943512, + "lr": 2.7009202453987733e-06, + "objective/entropy": 100.3080825805664, + "objective/kl": 18.584598541259766, + "objective/non_score_reward": -1.8584599494934082, + "objective/rlhf_reward": -5.033839440345764, + "objective/scores": 0.6, + "policy/approxkl_avg": 68.59663391113281, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6090136170387268, + "step": 1560, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989771842956543 + }, + { + "episode": 24992, + "epoch": 0.44922169896106695, + "loss/policy_avg": 0.485708087682724, + "lr": 2.700728527607362e-06, + "objective/entropy": -145.61587524414062, + "objective/kl": 9.420523643493652, + "objective/non_score_reward": -0.9420523643493652, + "objective/rlhf_reward": 0.6317904084920887, + "objective/scores": 1.1, + "policy/approxkl_avg": 15.837213516235352, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.732647180557251, + "step": 1561, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9991077184677124 + }, + { + "episode": 25008, + "epoch": 0.44950929287845565, + "loss/policy_avg": -0.20601963996887207, + "lr": 2.700536809815951e-06, + "objective/entropy": 38.85050582885742, + "objective/kl": 16.94233512878418, + "objective/non_score_reward": -1.6942335367202759, + "objective/rlhf_reward": -2.376934146881103, + "objective/scores": 1.1, + "policy/approxkl_avg": 44.557830810546875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6362634301185608, + "step": 1562, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0128636360168457 + }, + { + "episode": 25024, + "epoch": 0.4497968867958443, + "loss/policy_avg": 0.4293028712272644, + "lr": 2.70034509202454e-06, + "objective/entropy": -42.770118713378906, + "objective/kl": 18.898033142089844, + "objective/non_score_reward": -1.889803409576416, + "objective/rlhf_reward": -7.159213399887085, + "objective/scores": 0.1, + "policy/approxkl_avg": 46.76361846923828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8298807144165039, + "step": 1563, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000500202178955 + }, + { + "episode": 25040, + "epoch": 0.4500844807132329, + "loss/policy_avg": 0.29531529545783997, + "lr": 2.700153374233129e-06, + "objective/entropy": -97.281494140625, + "objective/kl": 11.071046829223633, + "objective/non_score_reward": -1.1071045398712158, + "objective/rlhf_reward": -0.028418457508086803, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.9526071548461914, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7094585299491882, + "step": 1564, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980638027191162 + }, + { + "episode": 25056, + "epoch": 0.45037207463062157, + "loss/policy_avg": 0.046556033194065094, + "lr": 2.699961656441718e-06, + "objective/entropy": 107.26701354980469, + "objective/kl": 19.248638153076172, + "objective/non_score_reward": -1.9248639345169067, + "objective/rlhf_reward": -7.299455797672271, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.94715881347656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.666810154914856, + "step": 1565, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988000392913818 + }, + { + "episode": 25072, + "epoch": 0.4506596685480102, + "loss/policy_avg": 0.012369789183139801, + "lr": 2.699769938650307e-06, + "objective/entropy": 147.94244384765625, + "objective/kl": 15.91103744506836, + "objective/non_score_reward": -1.5911036729812622, + "objective/rlhf_reward": -8.36441421508789, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.165550231933594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7558436393737793, + "step": 1566, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973013401031494 + }, + { + "episode": 25088, + "epoch": 0.45094726246539885, + "loss/policy_avg": -0.012968000024557114, + "lr": 2.6995782208588958e-06, + "objective/entropy": -157.74293518066406, + "objective/kl": 16.778770446777344, + "objective/non_score_reward": -1.6778768301010132, + "objective/rlhf_reward": -6.311507439613342, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.33392333984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7485702633857727, + "step": 1567, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000155448913574 + }, + { + "episode": 25104, + "epoch": 0.4512348563827875, + "loss/policy_avg": 0.2825145125389099, + "lr": 2.6993865030674846e-06, + "objective/entropy": -26.431015014648438, + "objective/kl": 15.26270866394043, + "objective/non_score_reward": -1.5262709856033325, + "objective/rlhf_reward": -4.5009638703504375, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 25.48444366455078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.834632158279419, + "step": 1568, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994863271713257 + }, + { + "episode": 25120, + "epoch": 0.4515224503001761, + "loss/policy_avg": -0.168730229139328, + "lr": 2.6991947852760734e-06, + "objective/entropy": 39.87802505493164, + "objective/kl": 21.178451538085938, + "objective/non_score_reward": -2.117845058441162, + "objective/rlhf_reward": -5.547661755920622, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.897341728210449, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7191267013549805, + "step": 1569, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011539459228516 + }, + { + "episode": 25136, + "epoch": 0.4518100442175648, + "loss/policy_avg": 0.13665719330310822, + "lr": 2.6990030674846626e-06, + "objective/entropy": 118.00531005859375, + "objective/kl": 10.522141456604004, + "objective/non_score_reward": -1.0522141456604004, + "objective/rlhf_reward": -6.208856582641602, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.5498545169830322, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6503363847732544, + "step": 1570, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0017614364624023 + }, + { + "episode": 25152, + "epoch": 0.45209763813495346, + "loss/policy_avg": 0.1307908147573471, + "lr": 2.6988113496932514e-06, + "objective/entropy": -69.06759643554688, + "objective/kl": 17.492443084716797, + "objective/non_score_reward": -1.7492443323135376, + "objective/rlhf_reward": -8.996976852416992, + "objective/scores": -0.5, + "policy/approxkl_avg": 71.87448120117188, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.775144636631012, + "step": 1571, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9978597164154053 + }, + { + "episode": 25168, + "epoch": 0.4523852320523421, + "loss/policy_avg": 0.12863555550575256, + "lr": 2.6986196319018403e-06, + "objective/entropy": -15.985279083251953, + "objective/kl": 20.260404586791992, + "objective/non_score_reward": -2.0260403156280518, + "objective/rlhf_reward": -10.104161262512207, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.53135681152344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5794572830200195, + "step": 1572, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000314712524414 + }, + { + "episode": 25184, + "epoch": 0.45267282596973074, + "loss/policy_avg": 0.39225825667381287, + "lr": 2.6984279141104295e-06, + "objective/entropy": -145.51882934570312, + "objective/kl": 22.5490779876709, + "objective/non_score_reward": -2.2549078464508057, + "objective/rlhf_reward": -6.619630849361419, + "objective/scores": 0.6, + "policy/approxkl_avg": 99.43373107910156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.733991801738739, + "step": 1573, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997790813446045 + }, + { + "episode": 25200, + "epoch": 0.4529604198871194, + "loss/policy_avg": 0.23225891590118408, + "lr": 2.6982361963190183e-06, + "objective/entropy": 157.5059814453125, + "objective/kl": 13.41703987121582, + "objective/non_score_reward": -1.34170401096344, + "objective/rlhf_reward": -4.966816163063049, + "objective/scores": 0.1, + "policy/approxkl_avg": 71.42198181152344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5340787768363953, + "step": 1574, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0022149085998535 + }, + { + "episode": 25216, + "epoch": 0.453248013804508, + "loss/policy_avg": 0.09103409945964813, + "lr": 2.6980444785276075e-06, + "objective/entropy": 397.29486083984375, + "objective/kl": 17.116559982299805, + "objective/non_score_reward": -1.711656093597412, + "objective/rlhf_reward": -3.9229054197084636, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 85.59910583496094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9673483371734619, + "step": 1575, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997066855430603 + }, + { + "episode": 25232, + "epoch": 0.45353560772189666, + "loss/policy_avg": 0.659239649772644, + "lr": 2.6978527607361963e-06, + "objective/entropy": -50.505401611328125, + "objective/kl": 16.264190673828125, + "objective/non_score_reward": -1.6264190673828125, + "objective/rlhf_reward": -4.949417023864344, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 9.61915397644043, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53434157371521, + "step": 1576, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998645544052124 + }, + { + "episode": 25248, + "epoch": 0.45382320163928536, + "loss/policy_avg": 0.2389046847820282, + "lr": 2.697661042944785e-06, + "objective/entropy": 12.503677368164062, + "objective/kl": 15.69602108001709, + "objective/non_score_reward": -1.569602131843567, + "objective/rlhf_reward": -1.878408348560333, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.81786060333252, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8174733519554138, + "step": 1577, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997225046157837 + }, + { + "episode": 25264, + "epoch": 0.454110795556674, + "loss/policy_avg": 0.06463336944580078, + "lr": 2.6974693251533744e-06, + "objective/entropy": 14.687973022460938, + "objective/kl": 16.26601791381836, + "objective/non_score_reward": -1.626602053642273, + "objective/rlhf_reward": -8.50640869140625, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.611732482910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7420953512191772, + "step": 1578, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989168643951416 + }, + { + "episode": 25280, + "epoch": 0.45439838947406264, + "loss/policy_avg": 0.3006048798561096, + "lr": 2.697277607361963e-06, + "objective/entropy": 121.62173461914062, + "objective/kl": 17.033985137939453, + "objective/non_score_reward": -1.7033987045288086, + "objective/rlhf_reward": -6.413594579696655, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.945178985595703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.840303897857666, + "step": 1579, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979677200317383 + }, + { + "episode": 25296, + "epoch": 0.4546859833914513, + "loss/policy_avg": -0.2543651759624481, + "lr": 2.6970858895705524e-06, + "objective/entropy": 23.071609497070312, + "objective/kl": 16.931270599365234, + "objective/non_score_reward": -1.6931270360946655, + "objective/rlhf_reward": -6.372508069872856, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.906126022338867, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.81708824634552, + "step": 1580, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000901699066162 + }, + { + "episode": 25312, + "epoch": 0.4549735773088399, + "loss/policy_avg": 0.41243085265159607, + "lr": 2.6968941717791412e-06, + "objective/entropy": 356.097412109375, + "objective/kl": 16.318756103515625, + "objective/non_score_reward": -1.6318755149841309, + "objective/rlhf_reward": -3.603782948793146, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 52.568607330322266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9110679030418396, + "step": 1581, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000410318374634 + }, + { + "episode": 25328, + "epoch": 0.45526117122622856, + "loss/policy_avg": 0.07954712212085724, + "lr": 2.69670245398773e-06, + "objective/entropy": 114.1132583618164, + "objective/kl": 20.863964080810547, + "objective/non_score_reward": -2.0863962173461914, + "objective/rlhf_reward": -6.741465274159031, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 86.94965362548828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6331667900085449, + "step": 1582, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974026679992676 + }, + { + "episode": 25344, + "epoch": 0.4555487651436172, + "loss/policy_avg": 0.01582658290863037, + "lr": 2.6965107361963193e-06, + "objective/entropy": 195.87881469726562, + "objective/kl": 17.72394561767578, + "objective/non_score_reward": -1.7723946571350098, + "objective/rlhf_reward": -9.089578628540039, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.234439849853516, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8881067633628845, + "step": 1583, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999032735824585 + }, + { + "episode": 25360, + "epoch": 0.45583635906100584, + "loss/policy_avg": 0.36229202151298523, + "lr": 2.696319018404908e-06, + "objective/entropy": 23.606246948242188, + "objective/kl": 19.955965042114258, + "objective/non_score_reward": -1.9955966472625732, + "objective/rlhf_reward": -9.982385635375977, + "objective/scores": -0.5, + "policy/approxkl_avg": 53.62287139892578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.33281606435775757, + "step": 1584, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996920824050903 + }, + { + "episode": 25376, + "epoch": 0.45612395297839453, + "loss/policy_avg": 0.26224541664123535, + "lr": 2.696127300613497e-06, + "objective/entropy": -22.723011016845703, + "objective/kl": 14.956771850585938, + "objective/non_score_reward": -1.4956772327423096, + "objective/rlhf_reward": -7.982708930969238, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.4624900817871094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5113921165466309, + "step": 1585, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998342514038086 + }, + { + "episode": 25392, + "epoch": 0.45641154689578317, + "loss/policy_avg": 0.4027129113674164, + "lr": 2.695935582822086e-06, + "objective/entropy": -29.9932861328125, + "objective/kl": 14.198205947875977, + "objective/non_score_reward": -1.4198204278945923, + "objective/rlhf_reward": -3.5565754196801525, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 149.37881469726562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7940120697021484, + "step": 1586, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996426105499268 + }, + { + "episode": 25408, + "epoch": 0.4566991408131718, + "loss/policy_avg": 0.4442477226257324, + "lr": 2.695743865030675e-06, + "objective/entropy": 2.5545082092285156, + "objective/kl": 12.904587745666504, + "objective/non_score_reward": -1.2904589176177979, + "objective/rlhf_reward": -4.761835730075836, + "objective/scores": 0.1, + "policy/approxkl_avg": 44.589908599853516, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44873154163360596, + "step": 1587, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9959895610809326 + }, + { + "episode": 25424, + "epoch": 0.45698673473056045, + "loss/policy_avg": 0.2857421934604645, + "lr": 2.695552147239264e-06, + "objective/entropy": 133.09530639648438, + "objective/kl": 17.154420852661133, + "objective/non_score_reward": -1.715442180633545, + "objective/rlhf_reward": -4.739062073008094, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 129.16445922851562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7996494174003601, + "step": 1588, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9953253269195557 + }, + { + "episode": 25440, + "epoch": 0.4572743286479491, + "loss/policy_avg": 0.122508704662323, + "lr": 2.695360429447853e-06, + "objective/entropy": 76.19819641113281, + "objective/kl": 12.06997299194336, + "objective/non_score_reward": -1.206997275352478, + "objective/rlhf_reward": -4.4279892727732655, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.52509880065918, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.739288866519928, + "step": 1589, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986529350280762 + }, + { + "episode": 25456, + "epoch": 0.45756192256533773, + "loss/policy_avg": 0.4644879102706909, + "lr": 2.695168711656442e-06, + "objective/entropy": 124.99449157714844, + "objective/kl": 16.30022430419922, + "objective/non_score_reward": -1.6300225257873535, + "objective/rlhf_reward": -4.397383751646553, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 10.321215629577637, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6177806854248047, + "step": 1590, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980406761169434 + }, + { + "episode": 25472, + "epoch": 0.45784951648272637, + "loss/policy_avg": 0.24397248029708862, + "lr": 2.6949769938650306e-06, + "objective/entropy": 34.64955139160156, + "objective/kl": 14.790996551513672, + "objective/non_score_reward": -1.4790997505187988, + "objective/rlhf_reward": -5.5163992702960964, + "objective/scores": 0.1, + "policy/approxkl_avg": 120.2115707397461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8772210478782654, + "step": 1591, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973468780517578 + }, + { + "episode": 25488, + "epoch": 0.458137110400115, + "loss/policy_avg": 0.3860127925872803, + "lr": 2.6947852760736194e-06, + "objective/entropy": -19.67723846435547, + "objective/kl": 14.165413856506348, + "objective/non_score_reward": -1.4165414571762085, + "objective/rlhf_reward": -5.266165888309478, + "objective/scores": 0.1, + "policy/approxkl_avg": 12.3641357421875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6394959688186646, + "step": 1592, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994362592697144 + }, + { + "episode": 25504, + "epoch": 0.4584247043175037, + "loss/policy_avg": 0.06918300688266754, + "lr": 2.6945935582822086e-06, + "objective/entropy": 114.20576477050781, + "objective/kl": 16.480592727661133, + "objective/non_score_reward": -1.6480592489242554, + "objective/rlhf_reward": -2.192236995697021, + "objective/scores": 1.1, + "policy/approxkl_avg": 68.93272399902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5571388006210327, + "step": 1593, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999291181564331 + }, + { + "episode": 25520, + "epoch": 0.45871229823489235, + "loss/policy_avg": 0.029633134603500366, + "lr": 2.6944018404907975e-06, + "objective/entropy": 158.27218627929688, + "objective/kl": 13.224921226501465, + "objective/non_score_reward": -1.3224921226501465, + "objective/rlhf_reward": -7.289968490600586, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.776941299438477, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6619477272033691, + "step": 1594, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0042223930358887 + }, + { + "episode": 25536, + "epoch": 0.458999892152281, + "loss/policy_avg": 0.4518873989582062, + "lr": 2.6942101226993867e-06, + "objective/entropy": 130.73275756835938, + "objective/kl": 12.00938606262207, + "objective/non_score_reward": -1.2009387016296387, + "objective/rlhf_reward": -2.403754478693008, + "objective/scores": 0.6, + "policy/approxkl_avg": 24.144432067871094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4357760548591614, + "step": 1595, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999908208847046 + }, + { + "episode": 25552, + "epoch": 0.4592874860696696, + "loss/policy_avg": -0.008014561608433723, + "lr": 2.6940184049079755e-06, + "objective/entropy": -63.465999603271484, + "objective/kl": 16.632591247558594, + "objective/non_score_reward": -1.6632592678070068, + "objective/rlhf_reward": -2.2530369520187374, + "objective/scores": 1.1, + "policy/approxkl_avg": 73.7203140258789, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6869769096374512, + "step": 1596, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0008316040039062 + }, + { + "episode": 25568, + "epoch": 0.45957507998705827, + "loss/policy_avg": 0.11674131453037262, + "lr": 2.6938266871165643e-06, + "objective/entropy": -240.97103881835938, + "objective/kl": 13.844555854797363, + "objective/non_score_reward": -1.384455680847168, + "objective/rlhf_reward": -1.1378224849700924, + "objective/scores": 1.1, + "policy/approxkl_avg": 66.28314208984375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5940455198287964, + "step": 1597, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9971486330032349 + }, + { + "episode": 25584, + "epoch": 0.4598626739044469, + "loss/policy_avg": 0.22572100162506104, + "lr": 2.6936349693251535e-06, + "objective/entropy": 73.80177307128906, + "objective/kl": 17.256254196166992, + "objective/non_score_reward": -1.7256255149841309, + "objective/rlhf_reward": -6.502502059936523, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.520421981811523, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.44892245531082153, + "step": 1598, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999833583831787 + }, + { + "episode": 25600, + "epoch": 0.46015026782183555, + "loss/policy_avg": 0.17625027894973755, + "lr": 2.6934432515337424e-06, + "objective/entropy": 64.5656967163086, + "objective/kl": 15.18729019165039, + "objective/non_score_reward": -1.5187289714813232, + "objective/rlhf_reward": -5.674916064739227, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.129301071166992, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.658516526222229, + "step": 1599, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983505010604858 + }, + { + "episode": 25616, + "epoch": 0.46043786173922424, + "loss/policy_avg": -0.21334171295166016, + "lr": 2.693251533742331e-06, + "objective/entropy": -166.7024383544922, + "objective/kl": 14.429752349853516, + "objective/non_score_reward": -1.4429752826690674, + "objective/rlhf_reward": -3.9470721438256016, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.913631439208984, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5916569232940674, + "step": 1600, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.005368709564209 + }, + { + "episode": 25632, + "epoch": 0.4607254556566129, + "loss/policy_avg": 0.3519183099269867, + "lr": 2.6930598159509204e-06, + "objective/entropy": 203.21722412109375, + "objective/kl": 9.83327579498291, + "objective/non_score_reward": -0.9833276271820068, + "objective/rlhf_reward": 0.4666895508766178, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.112152099609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7379715442657471, + "step": 1601, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004913806915283 + }, + { + "episode": 25648, + "epoch": 0.4610130495740015, + "loss/policy_avg": -0.3812810778617859, + "lr": 2.692868098159509e-06, + "objective/entropy": -146.57159423828125, + "objective/kl": 11.350961685180664, + "objective/non_score_reward": -1.1350963115692139, + "objective/rlhf_reward": -4.140384978055954, + "objective/scores": 0.1, + "policy/approxkl_avg": 46.152496337890625, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.651496171951294, + "step": 1602, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0040106773376465 + }, + { + "episode": 25664, + "epoch": 0.46130064349139016, + "loss/policy_avg": 0.1941705346107483, + "lr": 2.6926763803680984e-06, + "objective/entropy": 66.99221801757812, + "objective/kl": 10.873661041259766, + "objective/non_score_reward": -1.0873661041259766, + "objective/rlhf_reward": -2.6876049838667972, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 23.330978393554688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5211083292961121, + "step": 1603, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978305101394653 + }, + { + "episode": 25680, + "epoch": 0.4615882374087788, + "loss/policy_avg": 0.5209361910820007, + "lr": 2.6924846625766872e-06, + "objective/entropy": -39.09912109375, + "objective/kl": 7.698319435119629, + "objective/non_score_reward": -0.7698320150375366, + "objective/rlhf_reward": -0.9566216937461234, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 52.248313903808594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47457823157310486, + "step": 1604, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983742237091064 + }, + { + "episode": 25696, + "epoch": 0.46187583132616744, + "loss/policy_avg": 0.35121509432792664, + "lr": 2.692292944785276e-06, + "objective/entropy": 164.5242919921875, + "objective/kl": 22.50076675415039, + "objective/non_score_reward": -2.2500767707824707, + "objective/rlhf_reward": -11.000307083129883, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.89537811279297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.594908595085144, + "step": 1605, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997105598449707 + }, + { + "episode": 25712, + "epoch": 0.4621634252435561, + "loss/policy_avg": 0.3918079733848572, + "lr": 2.6921012269938653e-06, + "objective/entropy": -41.62696838378906, + "objective/kl": 12.502820014953613, + "objective/non_score_reward": -1.2502820491790771, + "objective/rlhf_reward": -0.6011278986930844, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.170791625976562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.73177570104599, + "step": 1606, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988858699798584 + }, + { + "episode": 25728, + "epoch": 0.4624510191609447, + "loss/policy_avg": 0.12750157713890076, + "lr": 2.691909509202454e-06, + "objective/entropy": 116.52877807617188, + "objective/kl": 14.242040634155273, + "objective/non_score_reward": -1.4242041110992432, + "objective/rlhf_reward": -5.296816742420196, + "objective/scores": 0.1, + "policy/approxkl_avg": 101.38616943359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7711246013641357, + "step": 1607, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977200031280518 + }, + { + "episode": 25744, + "epoch": 0.4627386130783334, + "loss/policy_avg": 0.4134465157985687, + "lr": 2.6917177914110433e-06, + "objective/entropy": 387.149658203125, + "objective/kl": 15.058012008666992, + "objective/non_score_reward": -1.5058010816574097, + "objective/rlhf_reward": -1.6232041329145428, + "objective/scores": 1.1, + "policy/approxkl_avg": 60.06370544433594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9526330828666687, + "step": 1608, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982794523239136 + }, + { + "episode": 25760, + "epoch": 0.46302620699572206, + "loss/policy_avg": 0.15410056710243225, + "lr": 2.691526073619632e-06, + "objective/entropy": 74.04141998291016, + "objective/kl": 15.319889068603516, + "objective/non_score_reward": -1.5319888591766357, + "objective/rlhf_reward": -3.2042362138044567, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 130.09005737304688, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7068464756011963, + "step": 1609, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0008046627044678 + }, + { + "episode": 25776, + "epoch": 0.4633138009131107, + "loss/policy_avg": -0.26566749811172485, + "lr": 2.691334355828221e-06, + "objective/entropy": 124.87033081054688, + "objective/kl": 12.6633939743042, + "objective/non_score_reward": -1.2663395404815674, + "objective/rlhf_reward": -2.9426516316094737, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 12.721351623535156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.644338071346283, + "step": 1610, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003638982772827 + }, + { + "episode": 25792, + "epoch": 0.46360139483049934, + "loss/policy_avg": 0.10031799972057343, + "lr": 2.69114263803681e-06, + "objective/entropy": 96.79659271240234, + "objective/kl": 15.103389739990234, + "objective/non_score_reward": -1.5103390216827393, + "objective/rlhf_reward": -5.641355729103088, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.584495544433594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.633362889289856, + "step": 1611, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000192165374756 + }, + { + "episode": 25808, + "epoch": 0.463888988747888, + "loss/policy_avg": 0.839004635810852, + "lr": 2.6909509202453986e-06, + "objective/entropy": 9.612741470336914, + "objective/kl": 17.089344024658203, + "objective/non_score_reward": -1.7089345455169678, + "objective/rlhf_reward": -2.4357380032539364, + "objective/scores": 1.1, + "policy/approxkl_avg": 26.429393768310547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5412091016769409, + "step": 1612, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982448816299438 + }, + { + "episode": 25824, + "epoch": 0.4641765826652766, + "loss/policy_avg": 0.4816775918006897, + "lr": 2.690759202453988e-06, + "objective/entropy": 28.11272430419922, + "objective/kl": 15.321728706359863, + "objective/non_score_reward": -1.5321729183197021, + "objective/rlhf_reward": -5.728691792488098, + "objective/scores": 0.1, + "policy/approxkl_avg": 25.030738830566406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.735893189907074, + "step": 1613, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991583824157715 + }, + { + "episode": 25840, + "epoch": 0.46446417658266526, + "loss/policy_avg": -0.004038345068693161, + "lr": 2.6905674846625766e-06, + "objective/entropy": 103.99654388427734, + "objective/kl": 11.92760944366455, + "objective/non_score_reward": -1.192760944366455, + "objective/rlhf_reward": -6.771043300628662, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.085834503173828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5473529100418091, + "step": 1614, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000544309616089 + }, + { + "episode": 25856, + "epoch": 0.46475177050005395, + "loss/policy_avg": 0.09000711143016815, + "lr": 2.6903757668711654e-06, + "objective/entropy": 76.68963623046875, + "objective/kl": 12.239154815673828, + "objective/non_score_reward": -1.2239153385162354, + "objective/rlhf_reward": -4.495661205053329, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.7319424152374268, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.648979663848877, + "step": 1615, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000762939453125 + }, + { + "episode": 25872, + "epoch": 0.4650393644174426, + "loss/policy_avg": 0.6946203708648682, + "lr": 2.6901840490797547e-06, + "objective/entropy": 210.3482666015625, + "objective/kl": 14.363799095153809, + "objective/non_score_reward": -1.4363799095153809, + "objective/rlhf_reward": -1.3455196976661679, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.82388687133789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7468467950820923, + "step": 1616, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972286224365234 + }, + { + "episode": 25888, + "epoch": 0.46532695833483123, + "loss/policy_avg": 0.28084972500801086, + "lr": 2.6899923312883435e-06, + "objective/entropy": 183.54605102539062, + "objective/kl": 19.414718627929688, + "objective/non_score_reward": -1.9414719343185425, + "objective/rlhf_reward": -7.36588761806488, + "objective/scores": 0.1, + "policy/approxkl_avg": 69.62510681152344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6482871770858765, + "step": 1617, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9941153526306152 + }, + { + "episode": 25904, + "epoch": 0.46561455225221987, + "loss/policy_avg": 0.06988838315010071, + "lr": 2.6898006134969327e-06, + "objective/entropy": -54.10508728027344, + "objective/kl": 15.33044147491455, + "objective/non_score_reward": -1.5330440998077393, + "objective/rlhf_reward": -8.132176399230957, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.571006774902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7482419013977051, + "step": 1618, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.00229549407959 + }, + { + "episode": 25920, + "epoch": 0.4659021461696085, + "loss/policy_avg": 0.20038458704948425, + "lr": 2.6896088957055215e-06, + "objective/entropy": 23.637174606323242, + "objective/kl": 20.115951538085938, + "objective/non_score_reward": -2.0115952491760254, + "objective/rlhf_reward": -10.046380996704102, + "objective/scores": -0.5, + "policy/approxkl_avg": 261.33624267578125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6856141090393066, + "step": 1619, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9966861009597778 + }, + { + "episode": 25936, + "epoch": 0.46618974008699715, + "loss/policy_avg": 0.03766409680247307, + "lr": 2.6894171779141103e-06, + "objective/entropy": 260.55914306640625, + "objective/kl": 17.825664520263672, + "objective/non_score_reward": -1.7825665473937988, + "objective/rlhf_reward": -4.206547056080076, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 49.903568267822266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7212458848953247, + "step": 1620, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986686706542969 + }, + { + "episode": 25952, + "epoch": 0.4664773340043858, + "loss/policy_avg": 0.1514892280101776, + "lr": 2.6892254601226995e-06, + "objective/entropy": 49.057987213134766, + "objective/kl": 18.460351943969727, + "objective/non_score_reward": -1.8460350036621094, + "objective/rlhf_reward": -5.7222807459241025, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 94.69145202636719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6090638637542725, + "step": 1621, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974236488342285 + }, + { + "episode": 25968, + "epoch": 0.46676492792177443, + "loss/policy_avg": 0.7067244052886963, + "lr": 2.6890337423312884e-06, + "objective/entropy": 25.550270080566406, + "objective/kl": 17.252092361450195, + "objective/non_score_reward": -1.7252092361450195, + "objective/rlhf_reward": -8.900836944580078, + "objective/scores": -0.5, + "policy/approxkl_avg": 169.4148712158203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7647451162338257, + "step": 1622, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988857507705688 + }, + { + "episode": 25984, + "epoch": 0.4670525218391631, + "loss/policy_avg": 0.020154371857643127, + "lr": 2.688842024539877e-06, + "objective/entropy": -240.26010131835938, + "objective/kl": 21.083240509033203, + "objective/non_score_reward": -2.1083240509033203, + "objective/rlhf_reward": -8.033295547962188, + "objective/scores": 0.1, + "policy/approxkl_avg": 47.80495071411133, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5152453184127808, + "step": 1623, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998983383178711 + }, + { + "episode": 26000, + "epoch": 0.46734011575655177, + "loss/policy_avg": 0.4304991364479065, + "lr": 2.6886503067484664e-06, + "objective/entropy": -155.2620849609375, + "objective/kl": 16.325481414794922, + "objective/non_score_reward": -1.6325483322143555, + "objective/rlhf_reward": -6.130193388462066, + "objective/scores": 0.1, + "policy/approxkl_avg": 99.36180114746094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6268974542617798, + "step": 1624, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9972355365753174 + }, + { + "episode": 26016, + "epoch": 0.4676277096739404, + "loss/policy_avg": 0.20302683115005493, + "lr": 2.688458588957055e-06, + "objective/entropy": 189.250244140625, + "objective/kl": 14.729907035827637, + "objective/non_score_reward": -1.4729907512664795, + "objective/rlhf_reward": -3.7692564151444774, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 34.47994613647461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5767304301261902, + "step": 1625, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9969828128814697 + }, + { + "episode": 26032, + "epoch": 0.46791530359132905, + "loss/policy_avg": 0.1673620343208313, + "lr": 2.6882668711656444e-06, + "objective/entropy": 190.11880493164062, + "objective/kl": 16.155166625976562, + "objective/non_score_reward": -1.6155165433883667, + "objective/rlhf_reward": -8.462066650390625, + "objective/scores": -0.5, + "policy/approxkl_avg": 25.304584503173828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9624060392379761, + "step": 1626, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002522468566895 + }, + { + "episode": 26048, + "epoch": 0.4682028975087177, + "loss/policy_avg": 0.10353785753250122, + "lr": 2.6880751533742333e-06, + "objective/entropy": 204.98744201660156, + "objective/kl": 15.436996459960938, + "objective/non_score_reward": -1.5436995029449463, + "objective/rlhf_reward": -4.34996944216163, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 142.20309448242188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7310298681259155, + "step": 1627, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000544786453247 + }, + { + "episode": 26064, + "epoch": 0.4684904914261063, + "loss/policy_avg": 0.12273672968149185, + "lr": 2.687883435582822e-06, + "objective/entropy": -68.6159896850586, + "objective/kl": 19.266712188720703, + "objective/non_score_reward": -1.9266713857650757, + "objective/rlhf_reward": -3.306685423851013, + "objective/scores": 1.1, + "policy/approxkl_avg": 71.09552001953125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8100576400756836, + "step": 1628, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9961222410202026 + }, + { + "episode": 26080, + "epoch": 0.46877808534349497, + "loss/policy_avg": 0.047170355916023254, + "lr": 2.6876917177914113e-06, + "objective/entropy": 78.24015808105469, + "objective/kl": 13.48752212524414, + "objective/non_score_reward": -1.3487520217895508, + "objective/rlhf_reward": -4.995008414983749, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.823776245117188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8033530116081238, + "step": 1629, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998892545700073 + }, + { + "episode": 26096, + "epoch": 0.4690656792608836, + "loss/policy_avg": 0.20813743770122528, + "lr": 2.6875e-06, + "objective/entropy": -0.6008510589599609, + "objective/kl": 20.10837745666504, + "objective/non_score_reward": -2.0108377933502197, + "objective/rlhf_reward": -5.11963203990576, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 29.439212799072266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.533843994140625, + "step": 1630, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998657703399658 + }, + { + "episode": 26112, + "epoch": 0.4693532731782723, + "loss/policy_avg": 0.11692376434803009, + "lr": 2.6873082822085893e-06, + "objective/entropy": 47.902130126953125, + "objective/kl": 17.010051727294922, + "objective/non_score_reward": -1.7010051012039185, + "objective/rlhf_reward": -8.804019927978516, + "objective/scores": -0.5, + "policy/approxkl_avg": 54.74475860595703, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.614177942276001, + "step": 1631, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000200033187866 + }, + { + "episode": 26128, + "epoch": 0.46964086709566094, + "loss/policy_avg": 0.28958654403686523, + "lr": 2.687116564417178e-06, + "objective/entropy": 1.1801567077636719, + "objective/kl": 17.849510192871094, + "objective/non_score_reward": -1.7849509716033936, + "objective/rlhf_reward": -5.017097475306068, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 74.95404052734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6543114185333252, + "step": 1632, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001051902770996 + }, + { + "episode": 26144, + "epoch": 0.4699284610130496, + "loss/policy_avg": 0.48122167587280273, + "lr": 2.686924846625767e-06, + "objective/entropy": 181.40182495117188, + "objective/kl": 18.407989501953125, + "objective/non_score_reward": -1.8407987356185913, + "objective/rlhf_reward": -5.759075019423085, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 59.3737678527832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6886003017425537, + "step": 1633, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996335506439209 + }, + { + "episode": 26160, + "epoch": 0.4702160549304382, + "loss/policy_avg": 0.09453214704990387, + "lr": 2.6867331288343558e-06, + "objective/entropy": 144.73582458496094, + "objective/kl": 15.524151802062988, + "objective/non_score_reward": -1.552415370941162, + "objective/rlhf_reward": -4.605541441504078, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 45.74516677856445, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.943570613861084, + "step": 1634, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998361349105835 + }, + { + "episode": 26176, + "epoch": 0.47050364884782686, + "loss/policy_avg": 0.6591281890869141, + "lr": 2.6865414110429446e-06, + "objective/entropy": -101.7740478515625, + "objective/kl": 19.701892852783203, + "objective/non_score_reward": -1.970189094543457, + "objective/rlhf_reward": -7.480756616592407, + "objective/scores": 0.1, + "policy/approxkl_avg": 152.74351501464844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6178215742111206, + "step": 1635, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983309507369995 + }, + { + "episode": 26192, + "epoch": 0.4707912427652155, + "loss/policy_avg": 0.203212171792984, + "lr": 2.686349693251534e-06, + "objective/entropy": -106.49153137207031, + "objective/kl": 14.910431861877441, + "objective/non_score_reward": -1.491043210029602, + "objective/rlhf_reward": -4.0167617005872085, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 39.786354064941406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8822001814842224, + "step": 1636, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9977169036865234 + }, + { + "episode": 26208, + "epoch": 0.47107883668260414, + "loss/policy_avg": -0.11033344268798828, + "lr": 2.6861579754601226e-06, + "objective/entropy": 131.6498565673828, + "objective/kl": 16.115337371826172, + "objective/non_score_reward": -1.6115338802337646, + "objective/rlhf_reward": -8.446135520935059, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.540287017822266, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6810669898986816, + "step": 1637, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0009217262268066 + }, + { + "episode": 26224, + "epoch": 0.47136643059999284, + "loss/policy_avg": 0.09941694140434265, + "lr": 2.6859662576687114e-06, + "objective/entropy": -60.27667236328125, + "objective/kl": 13.570518493652344, + "objective/non_score_reward": -1.357051968574524, + "objective/rlhf_reward": -5.028207635879516, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.635757446289062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7173567414283752, + "step": 1638, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975781440734863 + }, + { + "episode": 26240, + "epoch": 0.4716540245173815, + "loss/policy_avg": 0.6801759600639343, + "lr": 2.6857745398773007e-06, + "objective/entropy": 108.41455078125, + "objective/kl": 12.158008575439453, + "objective/non_score_reward": -1.2158007621765137, + "objective/rlhf_reward": -6.863203048706055, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.411354064941406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6560934782028198, + "step": 1639, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0010602474212646 + }, + { + "episode": 26256, + "epoch": 0.4719416184347701, + "loss/policy_avg": 1.7853608131408691, + "lr": 2.6855828220858895e-06, + "objective/entropy": 137.3661346435547, + "objective/kl": 11.159649848937988, + "objective/non_score_reward": -1.1159650087356567, + "objective/rlhf_reward": -0.06385996043682063, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.4167832136154175, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5946769714355469, + "step": 1640, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.006664276123047 + }, + { + "episode": 26272, + "epoch": 0.47222921235215876, + "loss/policy_avg": 0.48221543431282043, + "lr": 2.6853911042944787e-06, + "objective/entropy": 208.98565673828125, + "objective/kl": 11.308876037597656, + "objective/non_score_reward": -1.130887746810913, + "objective/rlhf_reward": -0.12355086803436244, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.408721446990967, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7396516799926758, + "step": 1641, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003695249557495 + }, + { + "episode": 26288, + "epoch": 0.4725168062695474, + "loss/policy_avg": 0.3023480176925659, + "lr": 2.6851993865030675e-06, + "objective/entropy": 349.9120178222656, + "objective/kl": 21.112802505493164, + "objective/non_score_reward": -2.1112802028656006, + "objective/rlhf_reward": -6.045120692253112, + "objective/scores": 0.6, + "policy/approxkl_avg": 64.028564453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9350632429122925, + "step": 1642, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987595081329346 + }, + { + "episode": 26304, + "epoch": 0.47280440018693604, + "loss/policy_avg": 0.23029407858848572, + "lr": 2.6850076687116563e-06, + "objective/entropy": 73.98474884033203, + "objective/kl": 15.536746978759766, + "objective/non_score_reward": -1.5536746978759766, + "objective/rlhf_reward": -3.8146989107131954, + "objective/scores": 0.6, + "policy/approxkl_avg": 24.919830322265625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4860690236091614, + "step": 1643, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997631311416626 + }, + { + "episode": 26320, + "epoch": 0.4730919941043247, + "loss/policy_avg": 0.10321778059005737, + "lr": 2.6848159509202456e-06, + "objective/entropy": -64.98163604736328, + "objective/kl": 11.039989471435547, + "objective/non_score_reward": -1.1039988994598389, + "objective/rlhf_reward": -2.468584398703511, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 18.096717834472656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5158751010894775, + "step": 1644, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00130558013916 + }, + { + "episode": 26336, + "epoch": 0.4733795880217133, + "loss/policy_avg": -0.002828633412718773, + "lr": 2.6846242331288344e-06, + "objective/entropy": -9.201019287109375, + "objective/kl": 10.427934646606445, + "objective/non_score_reward": -1.0427935123443604, + "objective/rlhf_reward": -6.171174049377441, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.601865768432617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.750141978263855, + "step": 1645, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995884895324707 + }, + { + "episode": 26352, + "epoch": 0.473667181939102, + "loss/policy_avg": 1.0678659677505493, + "lr": 2.6844325153374236e-06, + "objective/entropy": -81.63957977294922, + "objective/kl": 19.900205612182617, + "objective/non_score_reward": -1.9900203943252563, + "objective/rlhf_reward": -5.036362592817518, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 49.511234283447266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7680249214172363, + "step": 1646, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9990290403366089 + }, + { + "episode": 26368, + "epoch": 0.47395477585649065, + "loss/policy_avg": -0.299903929233551, + "lr": 2.6842407975460124e-06, + "objective/entropy": 132.30357360839844, + "objective/kl": 10.713934898376465, + "objective/non_score_reward": -1.071393609046936, + "objective/rlhf_reward": -3.8855744212865826, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.781431198120117, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5183275938034058, + "step": 1647, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997494220733643 + }, + { + "episode": 26384, + "epoch": 0.4742423697738793, + "loss/policy_avg": 0.9550938606262207, + "lr": 2.6840490797546012e-06, + "objective/entropy": 50.97139358520508, + "objective/kl": 15.559367179870605, + "objective/non_score_reward": -1.5559368133544922, + "objective/rlhf_reward": -3.300028239132139, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 66.05949401855469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5880213975906372, + "step": 1648, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9958624839782715 + }, + { + "episode": 26400, + "epoch": 0.47452996369126793, + "loss/policy_avg": 0.3384305238723755, + "lr": 2.6838573619631904e-06, + "objective/entropy": -16.053207397460938, + "objective/kl": 14.48770523071289, + "objective/non_score_reward": -1.4487704038619995, + "objective/rlhf_reward": -7.795081615447998, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.371421813964844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.80098557472229, + "step": 1649, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996049165725708 + }, + { + "episode": 26416, + "epoch": 0.47481755760865657, + "loss/policy_avg": 0.041066974401474, + "lr": 2.6836656441717793e-06, + "objective/entropy": 183.5389862060547, + "objective/kl": 16.953853607177734, + "objective/non_score_reward": -1.695385456085205, + "objective/rlhf_reward": -8.78154182434082, + "objective/scores": -0.5, + "policy/approxkl_avg": 52.10731506347656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5555412173271179, + "step": 1650, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003539562225342 + }, + { + "episode": 26432, + "epoch": 0.4751051515260452, + "loss/policy_avg": 0.15615364909172058, + "lr": 2.683473926380368e-06, + "objective/entropy": 14.035999298095703, + "objective/kl": 16.663122177124023, + "objective/non_score_reward": -1.666312336921692, + "objective/rlhf_reward": -8.66524887084961, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.927711486816406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5586593151092529, + "step": 1651, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988031387329102 + }, + { + "episode": 26448, + "epoch": 0.47539274544343385, + "loss/policy_avg": 0.8067106008529663, + "lr": 2.6832822085889573e-06, + "objective/entropy": 110.69371032714844, + "objective/kl": 17.34006118774414, + "objective/non_score_reward": -1.7340062856674194, + "objective/rlhf_reward": -6.536025023460388, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.030296325683594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.843829870223999, + "step": 1652, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000535726547241 + }, + { + "episode": 26464, + "epoch": 0.47568033936082255, + "loss/policy_avg": 0.4142029881477356, + "lr": 2.683090490797546e-06, + "objective/entropy": 182.6699676513672, + "objective/kl": 17.451894760131836, + "objective/non_score_reward": -1.7451895475387573, + "objective/rlhf_reward": -6.580758041143417, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.45771789550781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4756516218185425, + "step": 1653, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000260353088379 + }, + { + "episode": 26480, + "epoch": 0.4759679332782112, + "loss/policy_avg": 0.33923524618148804, + "lr": 2.6828987730061353e-06, + "objective/entropy": -156.19644165039062, + "objective/kl": 20.641033172607422, + "objective/non_score_reward": -2.064103364944458, + "objective/rlhf_reward": -3.8564135193824765, + "objective/scores": 1.1, + "policy/approxkl_avg": 145.35055541992188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6929056644439697, + "step": 1654, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000690221786499 + }, + { + "episode": 26496, + "epoch": 0.4762555271955998, + "loss/policy_avg": 0.0668339729309082, + "lr": 2.682707055214724e-06, + "objective/entropy": 119.09803771972656, + "objective/kl": 16.56161117553711, + "objective/non_score_reward": -1.6561613082885742, + "objective/rlhf_reward": -2.224645113945007, + "objective/scores": 1.1, + "policy/approxkl_avg": 88.94144439697266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7851405739784241, + "step": 1655, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987611770629883 + }, + { + "episode": 26512, + "epoch": 0.47654312111298847, + "loss/policy_avg": 0.4231521487236023, + "lr": 2.682515337423313e-06, + "objective/entropy": 198.75799560546875, + "objective/kl": 14.148205757141113, + "objective/non_score_reward": -1.4148205518722534, + "objective/rlhf_reward": -7.659282207489014, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.799705505371094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7525266408920288, + "step": 1656, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0012173652648926 + }, + { + "episode": 26528, + "epoch": 0.4768307150303771, + "loss/policy_avg": 0.37918904423713684, + "lr": 2.6823236196319018e-06, + "objective/entropy": 42.97578430175781, + "objective/kl": 13.691777229309082, + "objective/non_score_reward": -1.3691778182983398, + "objective/rlhf_reward": -7.476710796356201, + "objective/scores": -0.5, + "policy/approxkl_avg": 61.79126739501953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7020362615585327, + "step": 1657, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970271587371826 + }, + { + "episode": 26544, + "epoch": 0.47711830894776575, + "loss/policy_avg": 0.13542841374874115, + "lr": 2.6821319018404906e-06, + "objective/entropy": 26.955015182495117, + "objective/kl": 7.511106491088867, + "objective/non_score_reward": -0.7511106729507446, + "objective/rlhf_reward": 1.3955574050545696, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.479230880737305, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8626604080200195, + "step": 1658, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989900588989258 + }, + { + "episode": 26560, + "epoch": 0.4774059028651544, + "loss/policy_avg": 0.02864725887775421, + "lr": 2.68194018404908e-06, + "objective/entropy": 10.317928314208984, + "objective/kl": 15.39538288116455, + "objective/non_score_reward": -1.5395383834838867, + "objective/rlhf_reward": -4.496293788374053, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 14.582573890686035, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.60040283203125, + "step": 1659, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991271495819092 + }, + { + "episode": 26576, + "epoch": 0.477693496782543, + "loss/policy_avg": 0.2558051347732544, + "lr": 2.6817484662576686e-06, + "objective/entropy": 127.64970397949219, + "objective/kl": 14.231189727783203, + "objective/non_score_reward": -1.423119068145752, + "objective/rlhf_reward": -2.7687569602739543, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.797764778137207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6011125445365906, + "step": 1660, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995901584625244 + }, + { + "episode": 26592, + "epoch": 0.4779810906999317, + "loss/policy_avg": -0.2182144820690155, + "lr": 2.681556748466258e-06, + "objective/entropy": 2.858919143676758, + "objective/kl": 7.278576850891113, + "objective/non_score_reward": -0.7278577089309692, + "objective/rlhf_reward": -4.911430835723877, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.9155890941619873, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.541759729385376, + "step": 1661, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0009493827819824 + }, + { + "episode": 26608, + "epoch": 0.47826868461732036, + "loss/policy_avg": 0.8636528253555298, + "lr": 2.6813650306748467e-06, + "objective/entropy": 140.26402282714844, + "objective/kl": 19.363082885742188, + "objective/non_score_reward": -1.9363083839416504, + "objective/rlhf_reward": -7.345234012603759, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.906473159790039, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7830764055252075, + "step": 1662, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0002543926239014 + }, + { + "episode": 26624, + "epoch": 0.478556278534709, + "loss/policy_avg": 0.09733457863330841, + "lr": 2.6811733128834355e-06, + "objective/entropy": 70.89553833007812, + "objective/kl": 16.588911056518555, + "objective/non_score_reward": -1.658891201019287, + "objective/rlhf_reward": -8.635564804077148, + "objective/scores": -0.5, + "policy/approxkl_avg": 129.03024291992188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7068811655044556, + "step": 1663, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9981067180633545 + }, + { + "episode": 26640, + "epoch": 0.47884387245209764, + "loss/policy_avg": 0.3696031868457794, + "lr": 2.6809815950920247e-06, + "objective/entropy": -80.44916534423828, + "objective/kl": 16.037981033325195, + "objective/non_score_reward": -1.6037981510162354, + "objective/rlhf_reward": -2.01519258916378, + "objective/scores": 1.1, + "policy/approxkl_avg": 122.46257019042969, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5848633050918579, + "step": 1664, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999934434890747 + }, + { + "episode": 26656, + "epoch": 0.4791314663694863, + "loss/policy_avg": 0.05235806852579117, + "lr": 2.6807898773006135e-06, + "objective/entropy": 279.45391845703125, + "objective/kl": 12.546588897705078, + "objective/non_score_reward": -1.2546589374542236, + "objective/rlhf_reward": -3.1938070013848057, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 45.496795654296875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7179256081581116, + "step": 1665, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999608039855957 + }, + { + "episode": 26672, + "epoch": 0.4794190602868749, + "loss/policy_avg": 0.2440473437309265, + "lr": 2.6805981595092023e-06, + "objective/entropy": 230.3323974609375, + "objective/kl": 18.421417236328125, + "objective/non_score_reward": -1.8421416282653809, + "objective/rlhf_reward": -6.968566811084747, + "objective/scores": 0.1, + "policy/approxkl_avg": 113.02169799804688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7931129932403564, + "step": 1666, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004630088806152 + }, + { + "episode": 26688, + "epoch": 0.47970665420426356, + "loss/policy_avg": 0.4596465826034546, + "lr": 2.6804064417177916e-06, + "objective/entropy": -190.0703887939453, + "objective/kl": 13.094562530517578, + "objective/non_score_reward": -1.3094563484191895, + "objective/rlhf_reward": -3.5044923583666483, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 18.68999671936035, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6400429010391235, + "step": 1667, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988033771514893 + }, + { + "episode": 26704, + "epoch": 0.4799942481216522, + "loss/policy_avg": 0.04195621609687805, + "lr": 2.6802147239263804e-06, + "objective/entropy": 54.4300537109375, + "objective/kl": 15.151268005371094, + "objective/non_score_reward": -1.5151269435882568, + "objective/rlhf_reward": -5.660508012771606, + "objective/scores": 0.1, + "policy/approxkl_avg": 70.76380920410156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6737993955612183, + "step": 1668, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995094537734985 + }, + { + "episode": 26720, + "epoch": 0.4802818420390409, + "loss/policy_avg": 0.8512704372406006, + "lr": 2.6800230061349696e-06, + "objective/entropy": 10.23702621459961, + "objective/kl": 16.816810607910156, + "objective/non_score_reward": -1.6816810369491577, + "objective/rlhf_reward": -3.8030053719293804, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.483461380004883, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7293576002120972, + "step": 1669, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989674091339111 + }, + { + "episode": 26736, + "epoch": 0.48056943595642954, + "loss/policy_avg": 0.09557580947875977, + "lr": 2.6798312883435584e-06, + "objective/entropy": 216.87448120117188, + "objective/kl": 14.237401962280273, + "objective/non_score_reward": -1.4237401485443115, + "objective/rlhf_reward": -1.2949605941772457, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.813751220703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5397988557815552, + "step": 1670, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995009899139404 + }, + { + "episode": 26752, + "epoch": 0.4808570298738182, + "loss/policy_avg": 0.8583983182907104, + "lr": 2.6796395705521472e-06, + "objective/entropy": 325.3941650390625, + "objective/kl": 17.281902313232422, + "objective/non_score_reward": -1.7281899452209473, + "objective/rlhf_reward": -8.912759780883789, + "objective/scores": -0.5, + "policy/approxkl_avg": 68.82689666748047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8322206735610962, + "step": 1671, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000537395477295 + }, + { + "episode": 26768, + "epoch": 0.4811446237912068, + "loss/policy_avg": -0.03962177410721779, + "lr": 2.6794478527607365e-06, + "objective/entropy": 144.37335205078125, + "objective/kl": 16.263277053833008, + "objective/non_score_reward": -1.6263277530670166, + "objective/rlhf_reward": -4.105310714244842, + "objective/scores": 0.6, + "policy/approxkl_avg": 8.10549545288086, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6529961824417114, + "step": 1672, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979302883148193 + }, + { + "episode": 26784, + "epoch": 0.48143221770859546, + "loss/policy_avg": 0.8202505707740784, + "lr": 2.6792561349693253e-06, + "objective/entropy": 257.78668212890625, + "objective/kl": 18.381481170654297, + "objective/non_score_reward": -1.8381482362747192, + "objective/rlhf_reward": -5.40518171616071, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.968506813049316, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6601094603538513, + "step": 1673, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014734268188477 + }, + { + "episode": 26800, + "epoch": 0.4817198116259841, + "loss/policy_avg": 0.2722570300102234, + "lr": 2.679064417177914e-06, + "objective/entropy": 159.26776123046875, + "objective/kl": 23.311752319335938, + "objective/non_score_reward": -2.3311753273010254, + "objective/rlhf_reward": -4.9247020244598385, + "objective/scores": 1.1, + "policy/approxkl_avg": 23.478086471557617, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6764966249465942, + "step": 1674, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992938041687012 + }, + { + "episode": 26816, + "epoch": 0.48200740554337274, + "loss/policy_avg": 1.5214738845825195, + "lr": 2.6788726993865033e-06, + "objective/entropy": 53.97813415527344, + "objective/kl": 11.97580337524414, + "objective/non_score_reward": -1.197580337524414, + "objective/rlhf_reward": -4.390321350097656, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.5720791816711426, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5206632614135742, + "step": 1675, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003241777420044 + }, + { + "episode": 26832, + "epoch": 0.48229499946076143, + "loss/policy_avg": 0.0932961255311966, + "lr": 2.678680981595092e-06, + "objective/entropy": -24.687271118164062, + "objective/kl": 15.590835571289062, + "objective/non_score_reward": -1.5590835809707642, + "objective/rlhf_reward": -3.8363342046737667, + "objective/scores": 0.6, + "policy/approxkl_avg": 31.776020050048828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6759655475616455, + "step": 1676, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985594749450684 + }, + { + "episode": 26848, + "epoch": 0.48258259337815007, + "loss/policy_avg": 0.1449345350265503, + "lr": 2.6784892638036813e-06, + "objective/entropy": -87.6327133178711, + "objective/kl": 12.795013427734375, + "objective/non_score_reward": -1.2795013189315796, + "objective/rlhf_reward": -3.170593927578862, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 17.413463592529297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7654553055763245, + "step": 1677, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999047040939331 + }, + { + "episode": 26864, + "epoch": 0.4828701872955387, + "loss/policy_avg": 0.39904987812042236, + "lr": 2.67829754601227e-06, + "objective/entropy": -42.12534713745117, + "objective/kl": 15.66563606262207, + "objective/non_score_reward": -1.5665637254714966, + "objective/rlhf_reward": -8.266254425048828, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.999363899230957, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5445783138275146, + "step": 1678, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977316856384277 + }, + { + "episode": 26880, + "epoch": 0.48315778121292735, + "loss/policy_avg": 0.033817049115896225, + "lr": 2.678105828220859e-06, + "objective/entropy": 347.9168701171875, + "objective/kl": 13.055137634277344, + "objective/non_score_reward": -1.305513858795166, + "objective/rlhf_reward": -3.6657959361940176, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 37.63822555541992, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8259381055831909, + "step": 1679, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978172779083252 + }, + { + "episode": 26896, + "epoch": 0.483445375130316, + "loss/policy_avg": 0.09545890986919403, + "lr": 2.6779141104294478e-06, + "objective/entropy": 118.7197494506836, + "objective/kl": 12.683245658874512, + "objective/non_score_reward": -1.268324613571167, + "objective/rlhf_reward": -2.149579022766325, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 16.969579696655273, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5533679723739624, + "step": 1680, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996938705444336 + }, + { + "episode": 26912, + "epoch": 0.48373296904770463, + "loss/policy_avg": 0.3336024284362793, + "lr": 2.6777223926380366e-06, + "objective/entropy": 150.80264282226562, + "objective/kl": 17.18734359741211, + "objective/non_score_reward": -1.7187345027923584, + "objective/rlhf_reward": -6.474938011169433, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.541332244873047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.647294819355011, + "step": 1681, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9961645603179932 + }, + { + "episode": 26928, + "epoch": 0.48402056296509327, + "loss/policy_avg": 0.11377117037773132, + "lr": 2.677530674846626e-06, + "objective/entropy": 260.295654296875, + "objective/kl": 19.10438346862793, + "objective/non_score_reward": -1.9104382991790771, + "objective/rlhf_reward": -5.979893451154815, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 70.76204681396484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796458959579468, + "step": 1682, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962824583053589 + }, + { + "episode": 26944, + "epoch": 0.4843081568824819, + "loss/policy_avg": 0.5521122217178345, + "lr": 2.6773389570552146e-06, + "objective/entropy": 172.6796417236328, + "objective/kl": 17.577075958251953, + "objective/non_score_reward": -1.7577075958251953, + "objective/rlhf_reward": -4.1071117415439815, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 23.999298095703125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6038773059844971, + "step": 1683, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993399381637573 + }, + { + "episode": 26960, + "epoch": 0.4845957507998706, + "loss/policy_avg": 0.6839799880981445, + "lr": 2.677147239263804e-06, + "objective/entropy": -137.83380126953125, + "objective/kl": 18.686176300048828, + "objective/non_score_reward": -1.8686177730560303, + "objective/rlhf_reward": -3.0744709134101864, + "objective/scores": 1.1, + "policy/approxkl_avg": 51.80305099487305, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.39423924684524536, + "step": 1684, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998917818069458 + }, + { + "episode": 26976, + "epoch": 0.48488334471725925, + "loss/policy_avg": 0.10252314805984497, + "lr": 2.6769555214723927e-06, + "objective/entropy": 50.94281005859375, + "objective/kl": 15.866674423217773, + "objective/non_score_reward": -1.586667537689209, + "objective/rlhf_reward": -5.946670508384704, + "objective/scores": 0.1, + "policy/approxkl_avg": 92.38317108154297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6353904008865356, + "step": 1685, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0007472038269043 + }, + { + "episode": 26992, + "epoch": 0.4851709386346479, + "loss/policy_avg": 0.5134760737419128, + "lr": 2.6767638036809815e-06, + "objective/entropy": 75.39920806884766, + "objective/kl": 16.352144241333008, + "objective/non_score_reward": -1.6352144479751587, + "objective/rlhf_reward": -8.540857315063477, + "objective/scores": -0.5, + "policy/approxkl_avg": 69.13995361328125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7286012172698975, + "step": 1686, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9959840774536133 + }, + { + "episode": 27008, + "epoch": 0.4854585325520365, + "loss/policy_avg": -0.531006395816803, + "lr": 2.6765720858895707e-06, + "objective/entropy": 145.20367431640625, + "objective/kl": 10.78251838684082, + "objective/non_score_reward": -1.078251838684082, + "objective/rlhf_reward": -6.313007354736328, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.25226974487305, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5566188097000122, + "step": 1687, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006327629089355 + }, + { + "episode": 27024, + "epoch": 0.48574612646942517, + "loss/policy_avg": -0.807724118232727, + "lr": 2.6763803680981595e-06, + "objective/entropy": -169.50186157226562, + "objective/kl": 18.087905883789062, + "objective/non_score_reward": -1.8087905645370483, + "objective/rlhf_reward": -6.8351626157760625, + "objective/scores": 0.1, + "policy/approxkl_avg": 62.57245635986328, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.668152928352356, + "step": 1688, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0666861534118652 + }, + { + "episode": 27040, + "epoch": 0.4860337203868138, + "loss/policy_avg": 0.06836674362421036, + "lr": 2.6761886503067483e-06, + "objective/entropy": 90.70515441894531, + "objective/kl": 14.302078247070312, + "objective/non_score_reward": -1.4302079677581787, + "objective/rlhf_reward": -1.3208320423960682, + "objective/scores": 1.1, + "policy/approxkl_avg": 45.73702621459961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41737520694732666, + "step": 1689, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990456104278564 + }, + { + "episode": 27056, + "epoch": 0.48632131430420245, + "loss/policy_avg": 2.2695631980895996, + "lr": 2.6759969325153376e-06, + "objective/entropy": -119.82723999023438, + "objective/kl": 14.979490280151367, + "objective/non_score_reward": -1.4979491233825684, + "objective/rlhf_reward": -4.258463070789972, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 169.4292755126953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.44966164231300354, + "step": 1690, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990613460540771 + }, + { + "episode": 27072, + "epoch": 0.48660890822159114, + "loss/policy_avg": 0.25318387150764465, + "lr": 2.6758052147239264e-06, + "objective/entropy": -54.465782165527344, + "objective/kl": 10.187479019165039, + "objective/non_score_reward": -1.0187478065490723, + "objective/rlhf_reward": -6.074991226196289, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.81450271606445, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6709445714950562, + "step": 1691, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9990923404693604 + }, + { + "episode": 27088, + "epoch": 0.4868965021389798, + "loss/policy_avg": 0.5283163189888, + "lr": 2.6756134969325156e-06, + "objective/entropy": 32.93486785888672, + "objective/kl": 12.04556655883789, + "objective/non_score_reward": -1.2045565843582153, + "objective/rlhf_reward": -4.4182263821363446, + "objective/scores": 0.1, + "policy/approxkl_avg": 46.15248107910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7208871841430664, + "step": 1692, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000389337539673 + }, + { + "episode": 27104, + "epoch": 0.4871840960563684, + "loss/policy_avg": -0.20891764760017395, + "lr": 2.6754217791411044e-06, + "objective/entropy": -61.90003967285156, + "objective/kl": 10.981453895568848, + "objective/non_score_reward": -1.0981454849243164, + "objective/rlhf_reward": -1.4688628062021463, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 35.74761199951172, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6324474215507507, + "step": 1693, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0004220008850098 + }, + { + "episode": 27120, + "epoch": 0.48747168997375706, + "loss/policy_avg": 0.4535183906555176, + "lr": 2.6752300613496932e-06, + "objective/entropy": 197.14102172851562, + "objective/kl": 11.678915977478027, + "objective/non_score_reward": -1.1678917407989502, + "objective/rlhf_reward": -0.27156662046909297, + "objective/scores": 1.1, + "policy/approxkl_avg": 10.70810317993164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5310260057449341, + "step": 1694, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988822937011719 + }, + { + "episode": 27136, + "epoch": 0.4877592838911457, + "loss/policy_avg": 0.16736939549446106, + "lr": 2.6750383435582825e-06, + "objective/entropy": 57.92140579223633, + "objective/kl": 17.87525177001953, + "objective/non_score_reward": -1.7875254154205322, + "objective/rlhf_reward": -5.545981440607624, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 77.61048889160156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8444628715515137, + "step": 1695, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9988882541656494 + }, + { + "episode": 27152, + "epoch": 0.48804687780853434, + "loss/policy_avg": 0.05939989909529686, + "lr": 2.6748466257668713e-06, + "objective/entropy": -58.93162536621094, + "objective/kl": 21.960491180419922, + "objective/non_score_reward": -2.196049213409424, + "objective/rlhf_reward": -4.384197330474853, + "objective/scores": 1.1, + "policy/approxkl_avg": 82.17927551269531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6906427145004272, + "step": 1696, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995031356811523 + }, + { + "episode": 27168, + "epoch": 0.488334471725923, + "loss/policy_avg": 0.2913649082183838, + "lr": 2.6746549079754605e-06, + "objective/entropy": 81.17434692382812, + "objective/kl": 15.473855018615723, + "objective/non_score_reward": -1.5473856925964355, + "objective/rlhf_reward": -5.789542427659034, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.416595458984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7009164094924927, + "step": 1697, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989409446716309 + }, + { + "episode": 27184, + "epoch": 0.4886220656433116, + "loss/policy_avg": 0.2651277780532837, + "lr": 2.6744631901840493e-06, + "objective/entropy": 58.803428649902344, + "objective/kl": 18.283138275146484, + "objective/non_score_reward": -1.8283135890960693, + "objective/rlhf_reward": -4.389535520912382, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 105.51332092285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5497996211051941, + "step": 1698, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996532678604126 + }, + { + "episode": 27200, + "epoch": 0.4889096595607003, + "loss/policy_avg": 0.6435315608978271, + "lr": 2.674271472392638e-06, + "objective/entropy": -44.15504455566406, + "objective/kl": 15.604392051696777, + "objective/non_score_reward": -1.560438871383667, + "objective/rlhf_reward": -1.8417557053267952, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.01034545898438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5064129829406738, + "step": 1699, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977962970733643 + }, + { + "episode": 27216, + "epoch": 0.48919725347808896, + "loss/policy_avg": 0.5258274674415588, + "lr": 2.6740797546012274e-06, + "objective/entropy": 13.031089782714844, + "objective/kl": 7.618081092834473, + "objective/non_score_reward": -0.7618080377578735, + "objective/rlhf_reward": -0.12351331555959844, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.073127746582031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6002603769302368, + "step": 1700, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9963665008544922 + }, + { + "episode": 27232, + "epoch": 0.4894848473954776, + "loss/policy_avg": 0.5734251141548157, + "lr": 2.6738880368098157e-06, + "objective/entropy": 151.17642211914062, + "objective/kl": 22.45712661743164, + "objective/non_score_reward": -2.245712995529175, + "objective/rlhf_reward": -10.9828519821167, + "objective/scores": -0.5, + "policy/approxkl_avg": 80.04830932617188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8935544490814209, + "step": 1701, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976305961608887 + }, + { + "episode": 27248, + "epoch": 0.48977244131286624, + "loss/policy_avg": 0.18563227355480194, + "lr": 2.673696319018405e-06, + "objective/entropy": 71.73806762695312, + "objective/kl": 10.811704635620117, + "objective/non_score_reward": -1.0811705589294434, + "objective/rlhf_reward": -3.924682086706161, + "objective/scores": 0.1, + "policy/approxkl_avg": 68.60709381103516, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7594705820083618, + "step": 1702, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997520446777344 + }, + { + "episode": 27264, + "epoch": 0.4900600352302549, + "loss/policy_avg": 0.010921552777290344, + "lr": 2.6735046012269938e-06, + "objective/entropy": 312.49151611328125, + "objective/kl": 19.373149871826172, + "objective/non_score_reward": -1.9373149871826172, + "objective/rlhf_reward": -3.3492601871490475, + "objective/scores": 1.1, + "policy/approxkl_avg": 65.90713500976562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7818717956542969, + "step": 1703, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997588872909546 + }, + { + "episode": 27280, + "epoch": 0.4903476291476435, + "loss/policy_avg": 0.2919304668903351, + "lr": 2.6733128834355826e-06, + "objective/entropy": 100.27139282226562, + "objective/kl": 10.23652458190918, + "objective/non_score_reward": -1.0236525535583496, + "objective/rlhf_reward": 0.30538999438285863, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.8728735446929932, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5916879177093506, + "step": 1704, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0014801025390625 + }, + { + "episode": 27296, + "epoch": 0.49063522306503216, + "loss/policy_avg": 0.14356620609760284, + "lr": 2.673121165644172e-06, + "objective/entropy": 242.97398376464844, + "objective/kl": 16.659366607666016, + "objective/non_score_reward": -1.6659367084503174, + "objective/rlhf_reward": -8.66374683380127, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.75727081298828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6769480109214783, + "step": 1705, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984476566314697 + }, + { + "episode": 27312, + "epoch": 0.4909228169824208, + "loss/policy_avg": 0.2744619846343994, + "lr": 2.6729294478527606e-06, + "objective/entropy": -69.02651977539062, + "objective/kl": 17.554481506347656, + "objective/non_score_reward": -1.7554482221603394, + "objective/rlhf_reward": -9.0217924118042, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.135658264160156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5950236320495605, + "step": 1706, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9994828701019287 + }, + { + "episode": 27328, + "epoch": 0.4912104108998095, + "loss/policy_avg": 0.3935040235519409, + "lr": 2.67273773006135e-06, + "objective/entropy": 196.258544921875, + "objective/kl": 4.807914733886719, + "objective/non_score_reward": -0.48079144954681396, + "objective/rlhf_reward": 2.4768340677022938, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.6954267024993896, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.75687575340271, + "step": 1707, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0042829513549805 + }, + { + "episode": 27344, + "epoch": 0.49149800481719813, + "loss/policy_avg": 0.09713247418403625, + "lr": 2.6725460122699387e-06, + "objective/entropy": -250.28390502929688, + "objective/kl": 20.579936981201172, + "objective/non_score_reward": -2.0579938888549805, + "objective/rlhf_reward": -7.831975115835666, + "objective/scores": 0.1, + "policy/approxkl_avg": 49.66277313232422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7922552824020386, + "step": 1708, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9978537559509277 + }, + { + "episode": 27360, + "epoch": 0.49178559873458677, + "loss/policy_avg": 1.3034917116165161, + "lr": 2.6723542944785275e-06, + "objective/entropy": 293.18182373046875, + "objective/kl": 13.69601821899414, + "objective/non_score_reward": -1.3696017265319824, + "objective/rlhf_reward": -1.0784070849418637, + "objective/scores": 1.1, + "policy/approxkl_avg": 44.882728576660156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7728196382522583, + "step": 1709, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976916313171387 + }, + { + "episode": 27376, + "epoch": 0.4920731926519754, + "loss/policy_avg": 0.6501213312149048, + "lr": 2.6721625766871167e-06, + "objective/entropy": 26.59703826904297, + "objective/kl": 17.755245208740234, + "objective/non_score_reward": -1.7755244970321655, + "objective/rlhf_reward": -9.102097511291504, + "objective/scores": -0.5, + "policy/approxkl_avg": 62.82991027832031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9298363924026489, + "step": 1710, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0004334449768066 + }, + { + "episode": 27392, + "epoch": 0.49236078656936405, + "loss/policy_avg": 0.5077639222145081, + "lr": 2.6719708588957055e-06, + "objective/entropy": 161.69691467285156, + "objective/kl": 21.700233459472656, + "objective/non_score_reward": -2.1700234413146973, + "objective/rlhf_reward": -5.756374274135801, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 53.48500061035156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7480050325393677, + "step": 1711, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9954512119293213 + }, + { + "episode": 27408, + "epoch": 0.4926483804867527, + "loss/policy_avg": 0.42052412033081055, + "lr": 2.6717791411042948e-06, + "objective/entropy": -36.82831954956055, + "objective/kl": 11.032814979553223, + "objective/non_score_reward": -1.1032816171646118, + "objective/rlhf_reward": -6.413126468658447, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.36772537231445, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5636872053146362, + "step": 1712, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998363733291626 + }, + { + "episode": 27424, + "epoch": 0.49293597440414133, + "loss/policy_avg": 1.5223966836929321, + "lr": 2.6715874233128836e-06, + "objective/entropy": 125.52906799316406, + "objective/kl": 9.578523635864258, + "objective/non_score_reward": -0.9578523635864258, + "objective/rlhf_reward": -1.7087034008660653, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 11.727434158325195, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.28627026081085205, + "step": 1713, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001208543777466 + }, + { + "episode": 27440, + "epoch": 0.49322356832153, + "loss/policy_avg": 0.16528268158435822, + "lr": 2.6713957055214724e-06, + "objective/entropy": -198.51060485839844, + "objective/kl": 11.166755676269531, + "objective/non_score_reward": -1.116675615310669, + "objective/rlhf_reward": -0.06670258045196498, + "objective/scores": 1.1, + "policy/approxkl_avg": 48.924591064453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4975529909133911, + "step": 1714, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997665286064148 + }, + { + "episode": 27456, + "epoch": 0.49351116223891867, + "loss/policy_avg": -0.17147034406661987, + "lr": 2.6712039877300616e-06, + "objective/entropy": 172.23587036132812, + "objective/kl": 13.310028076171875, + "objective/non_score_reward": -1.3310028314590454, + "objective/rlhf_reward": -3.7677519535094053, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.9677109718322754, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7124722003936768, + "step": 1715, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003903865814209 + }, + { + "episode": 27472, + "epoch": 0.4937987561563073, + "loss/policy_avg": 0.05405956506729126, + "lr": 2.6710122699386504e-06, + "objective/entropy": -11.503082275390625, + "objective/kl": 17.24118995666504, + "objective/non_score_reward": -1.724118947982788, + "objective/rlhf_reward": -6.496476149559021, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.422582626342773, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7170135974884033, + "step": 1716, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000546455383301 + }, + { + "episode": 27488, + "epoch": 0.49408635007369595, + "loss/policy_avg": 0.5887688398361206, + "lr": 2.6708205521472392e-06, + "objective/entropy": 17.882118225097656, + "objective/kl": 19.30318260192871, + "objective/non_score_reward": -1.9303183555603027, + "objective/rlhf_reward": -4.797554348350737, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 135.39852905273438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6600601077079773, + "step": 1717, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995791912078857 + }, + { + "episode": 27504, + "epoch": 0.4943739439910846, + "loss/policy_avg": -0.27067333459854126, + "lr": 2.6706288343558285e-06, + "objective/entropy": 125.53389739990234, + "objective/kl": 16.03993797302246, + "objective/non_score_reward": -1.6039938926696777, + "objective/rlhf_reward": -8.415975570678711, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.563714027404785, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.48206979036331177, + "step": 1718, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0037009716033936 + }, + { + "episode": 27520, + "epoch": 0.4946615379084732, + "loss/policy_avg": 0.017645031213760376, + "lr": 2.6704371165644173e-06, + "objective/entropy": 232.59396362304688, + "objective/kl": 15.594569206237793, + "objective/non_score_reward": -1.5594568252563477, + "objective/rlhf_reward": -8.23782730102539, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.77301025390625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8485502004623413, + "step": 1719, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005722045898438 + }, + { + "episode": 27536, + "epoch": 0.49494913182586187, + "loss/policy_avg": 0.6044291257858276, + "lr": 2.6702453987730065e-06, + "objective/entropy": 112.50225067138672, + "objective/kl": 15.722814559936523, + "objective/non_score_reward": -1.5722814798355103, + "objective/rlhf_reward": -4.341715033130582, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 4.219062805175781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5630087852478027, + "step": 1720, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984500408172607 + }, + { + "episode": 27552, + "epoch": 0.4952367257432505, + "loss/policy_avg": 0.33518537878990173, + "lr": 2.6700536809815953e-06, + "objective/entropy": 158.0392608642578, + "objective/kl": 19.25482177734375, + "objective/non_score_reward": -1.9254825115203857, + "objective/rlhf_reward": -7.301930522918701, + "objective/scores": 0.1, + "policy/approxkl_avg": 105.86214447021484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6998374462127686, + "step": 1721, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984279870986938 + }, + { + "episode": 27568, + "epoch": 0.4955243196606392, + "loss/policy_avg": 0.09444519132375717, + "lr": 2.669861963190184e-06, + "objective/entropy": 187.67007446289062, + "objective/kl": 19.470523834228516, + "objective/non_score_reward": -1.9470525979995728, + "objective/rlhf_reward": -9.788209915161133, + "objective/scores": -0.5, + "policy/approxkl_avg": 110.34794616699219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7890366315841675, + "step": 1722, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979407787322998 + }, + { + "episode": 27584, + "epoch": 0.49581191357802784, + "loss/policy_avg": 0.9407260417938232, + "lr": 2.669670245398773e-06, + "objective/entropy": -31.73898696899414, + "objective/kl": 14.171585083007812, + "objective/non_score_reward": -1.4171584844589233, + "objective/rlhf_reward": -7.668633937835693, + "objective/scores": -0.5, + "policy/approxkl_avg": 55.22858428955078, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7059850692749023, + "step": 1723, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999644756317139 + }, + { + "episode": 27600, + "epoch": 0.4960995074954165, + "loss/policy_avg": 0.5673115253448486, + "lr": 2.6694785276073617e-06, + "objective/entropy": 2.834369659423828, + "objective/kl": 12.505474090576172, + "objective/non_score_reward": -1.2505474090576172, + "objective/rlhf_reward": -2.0784708007585735, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 35.140323638916016, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5003098845481873, + "step": 1724, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997611045837402 + }, + { + "episode": 27616, + "epoch": 0.4963871014128051, + "loss/policy_avg": 0.37552765011787415, + "lr": 2.669286809815951e-06, + "objective/entropy": 48.47637939453125, + "objective/kl": 14.658306121826172, + "objective/non_score_reward": -1.4658305644989014, + "objective/rlhf_reward": -5.463322436809539, + "objective/scores": 0.1, + "policy/approxkl_avg": 78.75804138183594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7523536682128906, + "step": 1725, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979631900787354 + }, + { + "episode": 27632, + "epoch": 0.49667469533019376, + "loss/policy_avg": 0.27567678689956665, + "lr": 2.66909509202454e-06, + "objective/entropy": 111.3084487915039, + "objective/kl": 20.33347511291504, + "objective/non_score_reward": -2.0333473682403564, + "objective/rlhf_reward": -7.7333897113800045, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.45749855041504, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5236789584159851, + "step": 1726, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998615980148315 + }, + { + "episode": 27648, + "epoch": 0.4969622892475824, + "loss/policy_avg": 0.4971972107887268, + "lr": 2.6689033742331286e-06, + "objective/entropy": -37.139991760253906, + "objective/kl": 11.294246673583984, + "objective/non_score_reward": -1.1294245719909668, + "objective/rlhf_reward": -6.517698287963867, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.072978973388672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7589508295059204, + "step": 1727, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0010251998901367 + }, + { + "episode": 27664, + "epoch": 0.49724988316497104, + "loss/policy_avg": 1.9962494373321533, + "lr": 2.668711656441718e-06, + "objective/entropy": 120.74639892578125, + "objective/kl": 20.389034271240234, + "objective/non_score_reward": -2.03890323638916, + "objective/rlhf_reward": -10.15561294555664, + "objective/scores": -0.5, + "policy/approxkl_avg": 152.50051879882812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5712071657180786, + "step": 1728, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977648258209229 + }, + { + "episode": 27680, + "epoch": 0.4975374770823597, + "loss/policy_avg": 0.04471936449408531, + "lr": 2.6685199386503066e-06, + "objective/entropy": 172.6311492919922, + "objective/kl": 17.905120849609375, + "objective/non_score_reward": -1.7905118465423584, + "objective/rlhf_reward": -6.762047624588012, + "objective/scores": 0.1, + "policy/approxkl_avg": 77.14485168457031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46560442447662354, + "step": 1729, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997314453125 + }, + { + "episode": 27696, + "epoch": 0.4978250709997484, + "loss/policy_avg": 0.12098614126443863, + "lr": 2.668328220858896e-06, + "objective/entropy": 61.26271438598633, + "objective/kl": 14.224048614501953, + "objective/non_score_reward": -1.4224047660827637, + "objective/rlhf_reward": -4.027759557188141, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 41.26526641845703, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7243987917900085, + "step": 1730, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998290538787842 + }, + { + "episode": 27712, + "epoch": 0.498112664917137, + "loss/policy_avg": 0.09704755246639252, + "lr": 2.6681365030674847e-06, + "objective/entropy": 226.2583465576172, + "objective/kl": 13.247810363769531, + "objective/non_score_reward": -1.3247809410095215, + "objective/rlhf_reward": -2.3754050775778026, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 14.467374801635742, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5639190673828125, + "step": 1731, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99894380569458 + }, + { + "episode": 27728, + "epoch": 0.49840025883452566, + "loss/policy_avg": 0.22803986072540283, + "lr": 2.6679447852760735e-06, + "objective/entropy": 24.980182647705078, + "objective/kl": 9.997011184692383, + "objective/non_score_reward": -0.9997010827064514, + "objective/rlhf_reward": -1.8760980687299111, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 48.40413284301758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4330744743347168, + "step": 1732, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993398189544678 + }, + { + "episode": 27744, + "epoch": 0.4986878527519143, + "loss/policy_avg": 0.2788919508457184, + "lr": 2.6677530674846627e-06, + "objective/entropy": -17.63727569580078, + "objective/kl": 16.732961654663086, + "objective/non_score_reward": -1.6732960939407349, + "objective/rlhf_reward": -6.293184316158294, + "objective/scores": 0.1, + "policy/approxkl_avg": 65.41502380371094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40989822149276733, + "step": 1733, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983696937561035 + }, + { + "episode": 27760, + "epoch": 0.49897544666930294, + "loss/policy_avg": 0.13127373158931732, + "lr": 2.6675613496932515e-06, + "objective/entropy": 123.1053237915039, + "objective/kl": 17.913818359375, + "objective/non_score_reward": -1.7913819551467896, + "objective/rlhf_reward": -6.7655277907848355, + "objective/scores": 0.1, + "policy/approxkl_avg": 37.40669631958008, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.48393747210502625, + "step": 1734, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971840381622314 + }, + { + "episode": 27776, + "epoch": 0.4992630405866916, + "loss/policy_avg": 0.1614466905593872, + "lr": 2.6673696319018408e-06, + "objective/entropy": 70.39647674560547, + "objective/kl": 15.208030700683594, + "objective/non_score_reward": -1.5208032131195068, + "objective/rlhf_reward": -5.683212614059448, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.147375106811523, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6510974764823914, + "step": 1735, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979395866394043 + }, + { + "episode": 27792, + "epoch": 0.4995506345040802, + "loss/policy_avg": 0.19504263997077942, + "lr": 2.6671779141104296e-06, + "objective/entropy": 24.889122009277344, + "objective/kl": 18.118755340576172, + "objective/non_score_reward": -1.811875581741333, + "objective/rlhf_reward": -5.422673340114663, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 11.235595703125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7391984462738037, + "step": 1736, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0010764598846436 + }, + { + "episode": 27808, + "epoch": 0.4998382284214689, + "loss/policy_avg": -0.01884336769580841, + "lr": 2.6669861963190184e-06, + "objective/entropy": -1.0885772705078125, + "objective/kl": 13.130938529968262, + "objective/non_score_reward": -1.313093900680542, + "objective/rlhf_reward": -2.328656946064207, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 10.169123649597168, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5120453238487244, + "step": 1737, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0012874603271484 + }, + { + "episode": 27824, + "epoch": 0.5001258223388575, + "loss/policy_avg": 0.19150349497795105, + "lr": 2.6667944785276076e-06, + "objective/entropy": -21.409439086914062, + "objective/kl": 14.654093742370605, + "objective/non_score_reward": -1.465409517288208, + "objective/rlhf_reward": -3.9142266166972473, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 33.64068603515625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5209637880325317, + "step": 1738, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000447988510132 + }, + { + "episode": 27840, + "epoch": 0.5004134162562461, + "loss/policy_avg": 0.4384639859199524, + "lr": 2.6666027607361964e-06, + "objective/entropy": -306.7899169921875, + "objective/kl": 16.510332107543945, + "objective/non_score_reward": -1.6510331630706787, + "objective/rlhf_reward": -5.000012729231434, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 38.28997039794922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5968725681304932, + "step": 1739, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9977682828903198 + }, + { + "episode": 27856, + "epoch": 0.5007010101736348, + "loss/policy_avg": 0.30068957805633545, + "lr": 2.6664110429447852e-06, + "objective/entropy": -86.3069839477539, + "objective/kl": 19.811098098754883, + "objective/non_score_reward": -1.9811099767684937, + "objective/rlhf_reward": -5.524439787864685, + "objective/scores": 0.6, + "policy/approxkl_avg": 37.62815856933594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7373462915420532, + "step": 1740, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983165264129639 + }, + { + "episode": 27872, + "epoch": 0.5009886040910235, + "loss/policy_avg": 0.17040492594242096, + "lr": 2.6662193251533745e-06, + "objective/entropy": 156.28121948242188, + "objective/kl": 11.497791290283203, + "objective/non_score_reward": -1.1497790813446045, + "objective/rlhf_reward": -2.1991163849830624, + "objective/scores": 0.6, + "policy/approxkl_avg": 7.689607620239258, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.38815832138061523, + "step": 1741, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002171516418457 + }, + { + "episode": 27888, + "epoch": 0.5012761980084122, + "loss/policy_avg": 0.06310490518808365, + "lr": 2.6660276073619633e-06, + "objective/entropy": 142.2759246826172, + "objective/kl": 22.004135131835938, + "objective/non_score_reward": -2.2004141807556152, + "objective/rlhf_reward": -4.401655888557434, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.01221466064453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6704832315444946, + "step": 1742, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99953293800354 + }, + { + "episode": 27904, + "epoch": 0.5015637919258008, + "loss/policy_avg": -0.008167348802089691, + "lr": 2.6658358895705525e-06, + "objective/entropy": 140.29356384277344, + "objective/kl": 16.384765625, + "objective/non_score_reward": -1.6384766101837158, + "objective/rlhf_reward": -6.153906142711639, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.04493713378906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.802377462387085, + "step": 1743, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984660148620605 + }, + { + "episode": 27920, + "epoch": 0.5018513858431894, + "loss/policy_avg": 0.3492729067802429, + "lr": 2.6656441717791413e-06, + "objective/entropy": 30.628379821777344, + "objective/kl": 12.58380126953125, + "objective/non_score_reward": -1.2583800554275513, + "objective/rlhf_reward": -2.6335202217102047, + "objective/scores": 0.6, + "policy/approxkl_avg": 9.596057891845703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.50040602684021, + "step": 1744, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998366117477417 + }, + { + "episode": 27936, + "epoch": 0.5021389797605781, + "loss/policy_avg": 0.5450857281684875, + "lr": 2.66545245398773e-06, + "objective/entropy": 18.2037296295166, + "objective/kl": 18.23172378540039, + "objective/non_score_reward": -1.8231723308563232, + "objective/rlhf_reward": -5.467860485586236, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 52.9035530090332, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5289323329925537, + "step": 1745, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986926317214966 + }, + { + "episode": 27952, + "epoch": 0.5024265736779667, + "loss/policy_avg": 0.11652082949876785, + "lr": 2.665260736196319e-06, + "objective/entropy": 336.8443603515625, + "objective/kl": 15.785937309265137, + "objective/non_score_reward": -1.5785937309265137, + "objective/rlhf_reward": -8.314374923706055, + "objective/scores": -0.5, + "policy/approxkl_avg": 112.4112548828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8711972832679749, + "step": 1746, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983396530151367 + }, + { + "episode": 27968, + "epoch": 0.5027141675953554, + "loss/policy_avg": 0.02146860957145691, + "lr": 2.6650690184049078e-06, + "objective/entropy": 202.36102294921875, + "objective/kl": 13.70978832244873, + "objective/non_score_reward": -1.370978832244873, + "objective/rlhf_reward": -7.483915328979492, + "objective/scores": -0.5, + "policy/approxkl_avg": 171.70550537109375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.80576491355896, + "step": 1747, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973173141479492 + }, + { + "episode": 27984, + "epoch": 0.503001761512744, + "loss/policy_avg": 0.46403220295906067, + "lr": 2.664877300613497e-06, + "objective/entropy": 336.539794921875, + "objective/kl": 12.597597122192383, + "objective/non_score_reward": -1.2597599029541016, + "objective/rlhf_reward": -0.6390396714210507, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.25482940673828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.808406412601471, + "step": 1748, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988746643066406 + }, + { + "episode": 28000, + "epoch": 0.5032893554301326, + "loss/policy_avg": 0.746623158454895, + "lr": 2.664685582822086e-06, + "objective/entropy": 236.44168090820312, + "objective/kl": 13.086444854736328, + "objective/non_score_reward": -1.3086445331573486, + "objective/rlhf_reward": -2.83457795381546, + "objective/scores": 0.6, + "policy/approxkl_avg": 8.036091804504395, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5963040590286255, + "step": 1749, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990828037261963 + }, + { + "episode": 28016, + "epoch": 0.5035769493475213, + "loss/policy_avg": 0.07168757915496826, + "lr": 2.664493865030675e-06, + "objective/entropy": 71.59014892578125, + "objective/kl": 18.932260513305664, + "objective/non_score_reward": -1.8932262659072876, + "objective/rlhf_reward": -5.450198697225128, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 51.12376403808594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3951878249645233, + "step": 1750, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968023300170898 + }, + { + "episode": 28032, + "epoch": 0.5038645432649099, + "loss/policy_avg": -0.30793046951293945, + "lr": 2.664302147239264e-06, + "objective/entropy": 42.83002853393555, + "objective/kl": 10.219228744506836, + "objective/non_score_reward": -1.0219228267669678, + "objective/rlhf_reward": -6.087691307067871, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.549211502075195, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6399141550064087, + "step": 1751, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00927734375 + }, + { + "episode": 28048, + "epoch": 0.5041521371822986, + "loss/policy_avg": 0.32605302333831787, + "lr": 2.6641104294478526e-06, + "objective/entropy": -77.07260131835938, + "objective/kl": 11.111099243164062, + "objective/non_score_reward": -1.111109972000122, + "objective/rlhf_reward": -4.0444394707679745, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.713918685913086, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5597299337387085, + "step": 1752, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988629817962646 + }, + { + "episode": 28064, + "epoch": 0.5044397310996872, + "loss/policy_avg": -0.13548192381858826, + "lr": 2.663918711656442e-06, + "objective/entropy": -74.50413513183594, + "objective/kl": 11.32368278503418, + "objective/non_score_reward": -1.1323683261871338, + "objective/rlhf_reward": -1.6057543500673501, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 44.30047607421875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6872704029083252, + "step": 1753, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0003886222839355 + }, + { + "episode": 28080, + "epoch": 0.5047273250170758, + "loss/policy_avg": 0.25978559255599976, + "lr": 2.6637269938650307e-06, + "objective/entropy": -102.45779418945312, + "objective/kl": 11.285503387451172, + "objective/non_score_reward": -1.1285502910614014, + "objective/rlhf_reward": -4.114201521873474, + "objective/scores": 0.1, + "policy/approxkl_avg": 44.16554260253906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.646943211555481, + "step": 1754, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980204105377197 + }, + { + "episode": 28096, + "epoch": 0.5050149189344645, + "loss/policy_avg": 0.017559155821800232, + "lr": 2.6635352760736195e-06, + "objective/entropy": 48.8609619140625, + "objective/kl": 10.03787612915039, + "objective/non_score_reward": -1.0037877559661865, + "objective/rlhf_reward": -6.015151023864746, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.1227680444717407, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7357013821601868, + "step": 1755, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001687526702881 + }, + { + "episode": 28112, + "epoch": 0.5053025128518532, + "loss/policy_avg": 0.6664595603942871, + "lr": 2.6633435582822087e-06, + "objective/entropy": 269.71820068359375, + "objective/kl": 20.73150062561035, + "objective/non_score_reward": -2.073150157928467, + "objective/rlhf_reward": -5.892600929737091, + "objective/scores": 0.6, + "policy/approxkl_avg": 91.48931121826172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6852459907531738, + "step": 1756, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988569021224976 + }, + { + "episode": 28128, + "epoch": 0.5055901067692419, + "loss/policy_avg": 0.5310183763504028, + "lr": 2.6631518404907975e-06, + "objective/entropy": -99.25455474853516, + "objective/kl": 9.640950202941895, + "objective/non_score_reward": -0.9640949964523315, + "objective/rlhf_reward": -3.4563798666000363, + "objective/scores": 0.1, + "policy/approxkl_avg": 33.46493911743164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4926908612251282, + "step": 1757, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990811347961426 + }, + { + "episode": 28144, + "epoch": 0.5058777006866305, + "loss/policy_avg": 0.12302426993846893, + "lr": 2.6629601226993868e-06, + "objective/entropy": 26.574722290039062, + "objective/kl": 14.709476470947266, + "objective/non_score_reward": -1.4709476232528687, + "objective/rlhf_reward": -7.883790493011475, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.887361526489258, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7425447702407837, + "step": 1758, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000166416168213 + }, + { + "episode": 28160, + "epoch": 0.5061652946040192, + "loss/policy_avg": 0.12881505489349365, + "lr": 2.6627684049079756e-06, + "objective/entropy": 7.130855560302734, + "objective/kl": 14.024930000305176, + "objective/non_score_reward": -1.4024930000305176, + "objective/rlhf_reward": -5.2099721193313595, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.538719177246094, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6327688694000244, + "step": 1759, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0035862922668457 + }, + { + "episode": 28176, + "epoch": 0.5064528885214078, + "loss/policy_avg": 0.04699630290269852, + "lr": 2.6625766871165644e-06, + "objective/entropy": 207.1057891845703, + "objective/kl": 17.828094482421875, + "objective/non_score_reward": -1.7828094959259033, + "objective/rlhf_reward": -2.731237924098968, + "objective/scores": 1.1, + "policy/approxkl_avg": 48.873268127441406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5094485282897949, + "step": 1760, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982911348342896 + }, + { + "episode": 28192, + "epoch": 0.5067404824387964, + "loss/policy_avg": 0.2013719379901886, + "lr": 2.6623849693251536e-06, + "objective/entropy": -141.62539672851562, + "objective/kl": 11.267175674438477, + "objective/non_score_reward": -1.1267175674438477, + "objective/rlhf_reward": -0.1068704336881634, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.064361572265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41444897651672363, + "step": 1761, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0017099380493164 + }, + { + "episode": 28208, + "epoch": 0.5070280763561851, + "loss/policy_avg": 0.541221022605896, + "lr": 2.6621932515337424e-06, + "objective/entropy": 206.86508178710938, + "objective/kl": 16.236549377441406, + "objective/non_score_reward": -1.623655080795288, + "objective/rlhf_reward": -8.494619369506836, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.70893096923828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7763517498970032, + "step": 1762, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991366863250732 + }, + { + "episode": 28224, + "epoch": 0.5073156702735737, + "loss/policy_avg": 0.4266519844532013, + "lr": 2.6620015337423317e-06, + "objective/entropy": -39.3725700378418, + "objective/kl": 19.346342086791992, + "objective/non_score_reward": -1.9346342086791992, + "objective/rlhf_reward": -3.3385367751121517, + "objective/scores": 1.1, + "policy/approxkl_avg": 95.08259582519531, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5408810973167419, + "step": 1763, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986789226531982 + }, + { + "episode": 28240, + "epoch": 0.5076032641909624, + "loss/policy_avg": 1.052474856376648, + "lr": 2.6618098159509205e-06, + "objective/entropy": 68.28462219238281, + "objective/kl": 17.100852966308594, + "objective/non_score_reward": -1.710085153579712, + "objective/rlhf_reward": -4.717634620443855, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 130.52293395996094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5202251672744751, + "step": 1764, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9974597692489624 + }, + { + "episode": 28256, + "epoch": 0.507890858108351, + "loss/policy_avg": 0.022500693798065186, + "lr": 2.6616180981595093e-06, + "objective/entropy": -7.4169158935546875, + "objective/kl": 15.090633392333984, + "objective/non_score_reward": -1.509063482284546, + "objective/rlhf_reward": -8.036253929138184, + "objective/scores": -0.5, + "policy/approxkl_avg": 60.160728454589844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7391468286514282, + "step": 1765, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971258640289307 + }, + { + "episode": 28272, + "epoch": 0.5081784520257396, + "loss/policy_avg": 0.23286819458007812, + "lr": 2.6614263803680985e-06, + "objective/entropy": -67.46270751953125, + "objective/kl": 9.249923706054688, + "objective/non_score_reward": -0.9249924421310425, + "objective/rlhf_reward": -5.699970245361328, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.629024088382721, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6066184043884277, + "step": 1766, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0047874450683594 + }, + { + "episode": 28288, + "epoch": 0.5084660459431283, + "loss/policy_avg": 0.4985557794570923, + "lr": 2.6612346625766873e-06, + "objective/entropy": 212.60858154296875, + "objective/kl": 17.220657348632812, + "objective/non_score_reward": -1.7220659255981445, + "objective/rlhf_reward": -8.888263702392578, + "objective/scores": -0.5, + "policy/approxkl_avg": 83.53437805175781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7002837657928467, + "step": 1767, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998044490814209 + }, + { + "episode": 28304, + "epoch": 0.5087536398605169, + "loss/policy_avg": 0.224037304520607, + "lr": 2.661042944785276e-06, + "objective/entropy": 146.78701782226562, + "objective/kl": 21.866727828979492, + "objective/non_score_reward": -2.1866729259490967, + "objective/rlhf_reward": -8.346691688895227, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.53478240966797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6788499355316162, + "step": 1768, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997270107269287 + }, + { + "episode": 28320, + "epoch": 0.5090412337779056, + "loss/policy_avg": 0.11605388671159744, + "lr": 2.660851226993865e-06, + "objective/entropy": 204.34317016601562, + "objective/kl": 21.220643997192383, + "objective/non_score_reward": -2.1220645904541016, + "objective/rlhf_reward": -6.365551712290321, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 57.263092041015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5966320037841797, + "step": 1769, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974641799926758 + }, + { + "episode": 28336, + "epoch": 0.5093288276952942, + "loss/policy_avg": 0.058426156640052795, + "lr": 2.6606595092024538e-06, + "objective/entropy": 119.5937271118164, + "objective/kl": 10.65241527557373, + "objective/non_score_reward": -1.0652415752410889, + "objective/rlhf_reward": -3.860966151952743, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.677213668823242, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6696159243583679, + "step": 1770, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011467933654785 + }, + { + "episode": 28352, + "epoch": 0.509616421612683, + "loss/policy_avg": 0.2700793743133545, + "lr": 2.660467791411043e-06, + "objective/entropy": 34.39029312133789, + "objective/kl": 17.694852828979492, + "objective/non_score_reward": -1.7694852352142334, + "objective/rlhf_reward": -4.6779410302639, + "objective/scores": 0.6, + "policy/approxkl_avg": 42.89826965332031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44230255484580994, + "step": 1771, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0011649131774902 + }, + { + "episode": 28368, + "epoch": 0.5099040155300716, + "loss/policy_avg": 0.22650663554668427, + "lr": 2.660276073619632e-06, + "objective/entropy": 62.46409606933594, + "objective/kl": 18.707399368286133, + "objective/non_score_reward": -1.8707401752471924, + "objective/rlhf_reward": -7.082960522174835, + "objective/scores": 0.1, + "policy/approxkl_avg": 122.68310546875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4607383608818054, + "step": 1772, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000152826309204 + }, + { + "episode": 28384, + "epoch": 0.5101916094474602, + "loss/policy_avg": 0.9882694482803345, + "lr": 2.660084355828221e-06, + "objective/entropy": 185.96481323242188, + "objective/kl": 18.96381378173828, + "objective/non_score_reward": -1.8963813781738281, + "objective/rlhf_reward": -3.185525274276733, + "objective/scores": 1.1, + "policy/approxkl_avg": 50.954315185546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8979307413101196, + "step": 1773, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9962632656097412 + }, + { + "episode": 28400, + "epoch": 0.5104792033648489, + "loss/policy_avg": -0.10225933790206909, + "lr": 2.65989263803681e-06, + "objective/entropy": -2.4116439819335938, + "objective/kl": 10.670534133911133, + "objective/non_score_reward": -1.0670535564422607, + "objective/rlhf_reward": -1.868213868141174, + "objective/scores": 0.6, + "policy/approxkl_avg": 35.16023254394531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62814861536026, + "step": 1774, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009822845458984 + }, + { + "episode": 28416, + "epoch": 0.5107667972822375, + "loss/policy_avg": 0.24588751792907715, + "lr": 2.6597009202453987e-06, + "objective/entropy": 126.577880859375, + "objective/kl": 14.00534439086914, + "objective/non_score_reward": -1.4005343914031982, + "objective/rlhf_reward": -2.6784184023153514, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 56.3424072265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.689423143863678, + "step": 1775, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982950687408447 + }, + { + "episode": 28432, + "epoch": 0.5110543911996261, + "loss/policy_avg": 0.263106107711792, + "lr": 2.659509202453988e-06, + "objective/entropy": -146.71322631835938, + "objective/kl": 18.640098571777344, + "objective/non_score_reward": -1.8640098571777344, + "objective/rlhf_reward": -9.456039428710938, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.64714050292969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46649169921875, + "step": 1776, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997348666191101 + }, + { + "episode": 28448, + "epoch": 0.5113419851170148, + "loss/policy_avg": -0.28649938106536865, + "lr": 2.6593174846625767e-06, + "objective/entropy": -46.71950149536133, + "objective/kl": 15.516162872314453, + "objective/non_score_reward": -1.5516164302825928, + "objective/rlhf_reward": -1.8064654231071469, + "objective/scores": 1.1, + "policy/approxkl_avg": 18.319168090820312, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.71426922082901, + "step": 1777, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.006796360015869 + }, + { + "episode": 28464, + "epoch": 0.5116295790344034, + "loss/policy_avg": 0.12089434266090393, + "lr": 2.6591257668711655e-06, + "objective/entropy": 116.71041107177734, + "objective/kl": 16.0557804107666, + "objective/non_score_reward": -1.605578064918518, + "objective/rlhf_reward": -6.022312170267105, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.31344985961914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5637420415878296, + "step": 1778, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995025396347046 + }, + { + "episode": 28480, + "epoch": 0.5119171729517921, + "loss/policy_avg": 0.5408236980438232, + "lr": 2.6589340490797547e-06, + "objective/entropy": -41.114479064941406, + "objective/kl": 13.265435218811035, + "objective/non_score_reward": -1.3265435695648193, + "objective/rlhf_reward": -4.906174151599407, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.033656120300293, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.48320022225379944, + "step": 1779, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000502586364746 + }, + { + "episode": 28496, + "epoch": 0.5122047668691807, + "loss/policy_avg": 0.5629329681396484, + "lr": 2.6587423312883435e-06, + "objective/entropy": 25.448585510253906, + "objective/kl": 10.900973320007324, + "objective/non_score_reward": -1.090097427368164, + "objective/rlhf_reward": -1.9603894189000126, + "objective/scores": 0.6, + "policy/approxkl_avg": 29.427316665649414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7866188287734985, + "step": 1780, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992313385009766 + }, + { + "episode": 28512, + "epoch": 0.5124923607865693, + "loss/policy_avg": 0.3418985903263092, + "lr": 2.6585506134969328e-06, + "objective/entropy": 77.93727111816406, + "objective/kl": 11.338996887207031, + "objective/non_score_reward": -1.1338996887207031, + "objective/rlhf_reward": -1.6118795021784034, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 4.558671951293945, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4622589349746704, + "step": 1781, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.005500555038452 + }, + { + "episode": 28528, + "epoch": 0.512779954703958, + "loss/policy_avg": -0.2597534954547882, + "lr": 2.6583588957055216e-06, + "objective/entropy": 211.84303283691406, + "objective/kl": 15.383636474609375, + "objective/non_score_reward": -1.5383635759353638, + "objective/rlhf_reward": -3.2297350808393688, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 13.554239273071289, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7210692167282104, + "step": 1782, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0022835731506348 + }, + { + "episode": 28544, + "epoch": 0.5130675486213466, + "loss/policy_avg": 0.6771409511566162, + "lr": 2.6581671779141104e-06, + "objective/entropy": 82.52877044677734, + "objective/kl": 16.593626022338867, + "objective/non_score_reward": -1.659362554550171, + "objective/rlhf_reward": -6.237450411915779, + "objective/scores": 0.1, + "policy/approxkl_avg": 18.595134735107422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7311805486679077, + "step": 1783, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988240003585815 + }, + { + "episode": 28560, + "epoch": 0.5133551425387353, + "loss/policy_avg": 0.5855733156204224, + "lr": 2.6579754601226996e-06, + "objective/entropy": 78.42135620117188, + "objective/kl": 16.883634567260742, + "objective/non_score_reward": -1.6883635520935059, + "objective/rlhf_reward": -6.353453657031059, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.954368591308594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7775710821151733, + "step": 1784, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976449012756348 + }, + { + "episode": 28576, + "epoch": 0.5136427364561239, + "loss/policy_avg": -0.47379228472709656, + "lr": 2.6577837423312884e-06, + "objective/entropy": 183.82095336914062, + "objective/kl": 17.60445213317871, + "objective/non_score_reward": -1.7604451179504395, + "objective/rlhf_reward": -2.6417807102203366, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.96839141845703, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4322192072868347, + "step": 1785, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000903844833374 + }, + { + "episode": 28592, + "epoch": 0.5139303303735125, + "loss/policy_avg": 0.040809325873851776, + "lr": 2.6575920245398777e-06, + "objective/entropy": 219.00308227539062, + "objective/kl": 18.765731811523438, + "objective/non_score_reward": -1.8765733242034912, + "objective/rlhf_reward": -3.1062932968139645, + "objective/scores": 1.1, + "policy/approxkl_avg": 92.84384155273438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5429558753967285, + "step": 1786, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964911937713623 + }, + { + "episode": 28608, + "epoch": 0.5142179242909013, + "loss/policy_avg": 0.05377146229147911, + "lr": 2.6574003067484665e-06, + "objective/entropy": 90.84602355957031, + "objective/kl": 20.2624568939209, + "objective/non_score_reward": -2.026245594024658, + "objective/rlhf_reward": -7.704982435703277, + "objective/scores": 0.1, + "policy/approxkl_avg": 193.996337890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7286136150360107, + "step": 1787, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996244192123413 + }, + { + "episode": 28624, + "epoch": 0.5145055182082899, + "loss/policy_avg": -0.007620755583047867, + "lr": 2.6572085889570553e-06, + "objective/entropy": -268.669921875, + "objective/kl": 12.50502872467041, + "objective/non_score_reward": -1.2505029439926147, + "objective/rlhf_reward": -0.6020117759704586, + "objective/scores": 1.1, + "policy/approxkl_avg": 10.24152946472168, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6491585373878479, + "step": 1788, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9990335702896118 + }, + { + "episode": 28640, + "epoch": 0.5147931121256786, + "loss/policy_avg": 0.09260250627994537, + "lr": 2.6570168711656445e-06, + "objective/entropy": -22.395694732666016, + "objective/kl": 14.976754188537598, + "objective/non_score_reward": -1.4976755380630493, + "objective/rlhf_reward": -5.590702152252197, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.219722747802734, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5158121585845947, + "step": 1789, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9984734058380127 + }, + { + "episode": 28656, + "epoch": 0.5150807060430672, + "loss/policy_avg": 0.1599373072385788, + "lr": 2.656825153374233e-06, + "objective/entropy": -151.83506774902344, + "objective/kl": 13.787343978881836, + "objective/non_score_reward": -1.3787343502044678, + "objective/rlhf_reward": -1.1149376392364498, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.0728182792663574, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6705546379089355, + "step": 1790, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0042343139648438 + }, + { + "episode": 28672, + "epoch": 0.5153682999604559, + "loss/policy_avg": 0.5093972086906433, + "lr": 2.656633435582822e-06, + "objective/entropy": 189.69818115234375, + "objective/kl": 14.04090690612793, + "objective/non_score_reward": -1.4040907621383667, + "objective/rlhf_reward": -7.616363048553467, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.02149963378906, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7557144165039062, + "step": 1791, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006518363952637 + }, + { + "episode": 28688, + "epoch": 0.5156558938778445, + "loss/policy_avg": 0.37555861473083496, + "lr": 2.656441717791411e-06, + "objective/entropy": -76.02454376220703, + "objective/kl": 19.386255264282227, + "objective/non_score_reward": -1.938625693321228, + "objective/rlhf_reward": -6.198243289199427, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 76.97025299072266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6998109817504883, + "step": 1792, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998724341392517 + }, + { + "episode": 28704, + "epoch": 0.5159434877952331, + "loss/policy_avg": 0.16148006916046143, + "lr": 2.6562499999999998e-06, + "objective/entropy": 62.66816329956055, + "objective/kl": 13.42384147644043, + "objective/non_score_reward": -1.3423840999603271, + "objective/rlhf_reward": -7.369536399841309, + "objective/scores": -0.5, + "policy/approxkl_avg": 58.304691314697266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5450599789619446, + "step": 1793, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999281644821167 + }, + { + "episode": 28720, + "epoch": 0.5162310817126218, + "loss/policy_avg": 0.27762842178344727, + "lr": 2.656058282208589e-06, + "objective/entropy": 103.40982055664062, + "objective/kl": 12.799996376037598, + "objective/non_score_reward": -1.2799994945526123, + "objective/rlhf_reward": -4.719998276233673, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.410861015319824, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.637082576751709, + "step": 1794, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999462366104126 + }, + { + "episode": 28736, + "epoch": 0.5165186756300104, + "loss/policy_avg": 0.409230500459671, + "lr": 2.655866564417178e-06, + "objective/entropy": 232.2705841064453, + "objective/kl": 22.624011993408203, + "objective/non_score_reward": -2.2624013423919678, + "objective/rlhf_reward": -6.92689901806501, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 48.6843147277832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6138824820518494, + "step": 1795, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988563060760498 + }, + { + "episode": 28752, + "epoch": 0.516806269547399, + "loss/policy_avg": 0.7418479919433594, + "lr": 2.655674846625767e-06, + "objective/entropy": -84.00031280517578, + "objective/kl": 14.806282043457031, + "objective/non_score_reward": -1.480628252029419, + "objective/rlhf_reward": -1.5225130677223202, + "objective/scores": 1.1, + "policy/approxkl_avg": 149.33465576171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6433936953544617, + "step": 1796, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997800350189209 + }, + { + "episode": 28768, + "epoch": 0.5170938634647877, + "loss/policy_avg": 0.217472642660141, + "lr": 2.655483128834356e-06, + "objective/entropy": -59.4697380065918, + "objective/kl": 17.53496551513672, + "objective/non_score_reward": -1.7534964084625244, + "objective/rlhf_reward": -4.6139857232570645, + "objective/scores": 0.6, + "policy/approxkl_avg": 69.00981140136719, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49576112627983093, + "step": 1797, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9981770515441895 + }, + { + "episode": 28784, + "epoch": 0.5173814573821763, + "loss/policy_avg": 0.26614606380462646, + "lr": 2.6552914110429447e-06, + "objective/entropy": 188.75660705566406, + "objective/kl": 11.638067245483398, + "objective/non_score_reward": -1.1638067960739136, + "objective/rlhf_reward": -4.25522700548172, + "objective/scores": 0.1, + "policy/approxkl_avg": 58.32883071899414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7997337579727173, + "step": 1798, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977247714996338 + }, + { + "episode": 28800, + "epoch": 0.517669051299565, + "loss/policy_avg": 0.12947332859039307, + "lr": 2.655099693251534e-06, + "objective/entropy": 151.32919311523438, + "objective/kl": 15.166569709777832, + "objective/non_score_reward": -1.5166568756103516, + "objective/rlhf_reward": -5.666627234220504, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.051401138305664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.699778139591217, + "step": 1799, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994456768035889 + }, + { + "episode": 28816, + "epoch": 0.5179566452169536, + "loss/policy_avg": -0.002175837755203247, + "lr": 2.6549079754601227e-06, + "objective/entropy": 35.160484313964844, + "objective/kl": 21.300491333007812, + "objective/non_score_reward": -2.130049228668213, + "objective/rlhf_reward": -10.520196914672852, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.026445388793945, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5395582914352417, + "step": 1800, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978394508361816 + }, + { + "episode": 28832, + "epoch": 0.5182442391343423, + "loss/policy_avg": 0.16893471777439117, + "lr": 2.654716257668712e-06, + "objective/entropy": 155.825439453125, + "objective/kl": 15.55232048034668, + "objective/non_score_reward": -1.555232048034668, + "objective/rlhf_reward": -1.8209277451038357, + "objective/scores": 1.1, + "policy/approxkl_avg": 118.18456268310547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5876666307449341, + "step": 1801, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9970424175262451 + }, + { + "episode": 28848, + "epoch": 0.518531833051731, + "loss/policy_avg": 0.22040048241615295, + "lr": 2.6545245398773007e-06, + "objective/entropy": -31.10854721069336, + "objective/kl": 15.175678253173828, + "objective/non_score_reward": -1.5175678730010986, + "objective/rlhf_reward": -3.9475650063910823, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 8.521119117736816, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6477307677268982, + "step": 1802, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999565839767456 + }, + { + "episode": 28864, + "epoch": 0.5188194269691196, + "loss/policy_avg": 0.7363486289978027, + "lr": 2.6543328220858896e-06, + "objective/entropy": -71.38179779052734, + "objective/kl": 14.623321533203125, + "objective/non_score_reward": -1.4623322486877441, + "objective/rlhf_reward": -1.4493289351463314, + "objective/scores": 1.1, + "policy/approxkl_avg": 117.98739624023438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7105915546417236, + "step": 1803, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9983479976654053 + }, + { + "episode": 28880, + "epoch": 0.5191070208865083, + "loss/policy_avg": 0.09098640829324722, + "lr": 2.654141104294479e-06, + "objective/entropy": 13.222919464111328, + "objective/kl": 16.781892776489258, + "objective/non_score_reward": -1.6781893968582153, + "objective/rlhf_reward": -8.712757110595703, + "objective/scores": -0.5, + "policy/approxkl_avg": 107.18089294433594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6652854084968567, + "step": 1804, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.000478744506836 + }, + { + "episode": 28896, + "epoch": 0.5193946148038969, + "loss/policy_avg": -0.08573319762945175, + "lr": 2.6539493865030676e-06, + "objective/entropy": 1.4567337036132812, + "objective/kl": 9.958293914794922, + "objective/non_score_reward": -0.9958294630050659, + "objective/rlhf_reward": 0.4166820734739307, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.789250373840332, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.8160400390625, + "step": 1805, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0015993118286133 + }, + { + "episode": 28912, + "epoch": 0.5196822087212856, + "loss/policy_avg": 0.21332049369812012, + "lr": 2.6537576687116564e-06, + "objective/entropy": 28.27048683166504, + "objective/kl": 11.810449600219727, + "objective/non_score_reward": -1.1810449361801147, + "objective/rlhf_reward": -2.3241798043251034, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.751129627227783, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.644847571849823, + "step": 1806, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001784324645996 + }, + { + "episode": 28928, + "epoch": 0.5199698026386742, + "loss/policy_avg": 0.2684786915779114, + "lr": 2.6535659509202456e-06, + "objective/entropy": -41.66497039794922, + "objective/kl": 18.632709503173828, + "objective/non_score_reward": -1.863270878791809, + "objective/rlhf_reward": -7.0530836343765255, + "objective/scores": 0.1, + "policy/approxkl_avg": 50.73341751098633, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5328782796859741, + "step": 1807, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980311393737793 + }, + { + "episode": 28944, + "epoch": 0.5202573965560628, + "loss/policy_avg": 0.27005735039711, + "lr": 2.6533742331288345e-06, + "objective/entropy": -92.68614959716797, + "objective/kl": 16.1339054107666, + "objective/non_score_reward": -1.6133904457092285, + "objective/rlhf_reward": -4.053561812639236, + "objective/scores": 0.6, + "policy/approxkl_avg": 75.5761489868164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.435627818107605, + "step": 1808, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997697114944458 + }, + { + "episode": 28960, + "epoch": 0.5205449904734515, + "loss/policy_avg": 0.47697752714157104, + "lr": 2.6531825153374237e-06, + "objective/entropy": 70.55982208251953, + "objective/kl": 15.462162017822266, + "objective/non_score_reward": -1.5462161302566528, + "objective/rlhf_reward": -3.261145387531492, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.127525329589844, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5837845802307129, + "step": 1809, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995059967041016 + }, + { + "episode": 28976, + "epoch": 0.5208325843908401, + "loss/policy_avg": 0.0944967269897461, + "lr": 2.6529907975460125e-06, + "objective/entropy": -106.81289672851562, + "objective/kl": 12.209148406982422, + "objective/non_score_reward": -1.2209150791168213, + "objective/rlhf_reward": -1.9599410190593927, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.313828945159912, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5029849410057068, + "step": 1810, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012412071228027 + }, + { + "episode": 28992, + "epoch": 0.5211201783082288, + "loss/policy_avg": 0.16460567712783813, + "lr": 2.6527990797546013e-06, + "objective/entropy": -46.76816940307617, + "objective/kl": 13.290376663208008, + "objective/non_score_reward": -1.3290376663208008, + "objective/rlhf_reward": -4.916150665283203, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.19758987426758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6998944282531738, + "step": 1811, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993230104446411 + }, + { + "episode": 29008, + "epoch": 0.5214077722256174, + "loss/policy_avg": 0.09649604558944702, + "lr": 2.65260736196319e-06, + "objective/entropy": -97.85505676269531, + "objective/kl": 13.006240844726562, + "objective/non_score_reward": -1.3006240129470825, + "objective/rlhf_reward": -4.802495850622654, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.20913314819336, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6500852108001709, + "step": 1812, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0012776851654053 + }, + { + "episode": 29024, + "epoch": 0.521695366143006, + "loss/policy_avg": 0.13328894972801208, + "lr": 2.652415644171779e-06, + "objective/entropy": 2.2023277282714844, + "objective/kl": 10.00713062286377, + "objective/non_score_reward": -1.0007131099700928, + "objective/rlhf_reward": -6.002852439880371, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.523300170898438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6957378387451172, + "step": 1813, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99851655960083 + }, + { + "episode": 29040, + "epoch": 0.5219829600603947, + "loss/policy_avg": 0.1445261836051941, + "lr": 2.652223926380368e-06, + "objective/entropy": -19.08232879638672, + "objective/kl": 10.624837875366211, + "objective/non_score_reward": -1.0624839067459106, + "objective/rlhf_reward": -2.6936763813167364, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 14.453914642333984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7210265398025513, + "step": 1814, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9996994733810425 + }, + { + "episode": 29056, + "epoch": 0.5222705539777833, + "loss/policy_avg": -0.1162421777844429, + "lr": 2.652032208588957e-06, + "objective/entropy": 148.0369873046875, + "objective/kl": 11.403135299682617, + "objective/non_score_reward": -1.1403135061264038, + "objective/rlhf_reward": -2.1612541288137432, + "objective/scores": 0.6, + "policy/approxkl_avg": 45.55539321899414, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6635168194770813, + "step": 1815, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.039712429046631 + }, + { + "episode": 29072, + "epoch": 0.522558147895172, + "loss/policy_avg": 0.2675674557685852, + "lr": 2.6518404907975458e-06, + "objective/entropy": -52.286190032958984, + "objective/kl": 18.950439453125, + "objective/non_score_reward": -1.8950438499450684, + "objective/rlhf_reward": -5.7553467109528285, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 27.757678985595703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6635863780975342, + "step": 1816, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989075660705566 + }, + { + "episode": 29088, + "epoch": 0.5228457418125607, + "loss/policy_avg": 0.2123212069272995, + "lr": 2.651648773006135e-06, + "objective/entropy": -30.773658752441406, + "objective/kl": 13.214252471923828, + "objective/non_score_reward": -1.321425199508667, + "objective/rlhf_reward": -7.285700798034668, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.348447799682617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6751729846000671, + "step": 1817, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9963544607162476 + }, + { + "episode": 29104, + "epoch": 0.5231333357299494, + "loss/policy_avg": 1.2047995328903198, + "lr": 2.651457055214724e-06, + "objective/entropy": 261.843505859375, + "objective/kl": 20.371292114257812, + "objective/non_score_reward": -2.0371291637420654, + "objective/rlhf_reward": -7.74851701259613, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.547252655029297, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.940751314163208, + "step": 1818, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997062683105469 + }, + { + "episode": 29120, + "epoch": 0.523420929647338, + "loss/policy_avg": 0.37461650371551514, + "lr": 2.651265337423313e-06, + "objective/entropy": -122.65673828125, + "objective/kl": 16.444629669189453, + "objective/non_score_reward": -1.6444628238677979, + "objective/rlhf_reward": -5.021592318025187, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 67.11688995361328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5965983867645264, + "step": 1819, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979946613311768 + }, + { + "episode": 29136, + "epoch": 0.5237085235647266, + "loss/policy_avg": -0.005302314180880785, + "lr": 2.651073619631902e-06, + "objective/entropy": 36.619300842285156, + "objective/kl": 19.354164123535156, + "objective/non_score_reward": -1.9354164600372314, + "objective/rlhf_reward": -3.341666108369827, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.855372428894043, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6970053911209106, + "step": 1820, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0006046295166016 + }, + { + "episode": 29152, + "epoch": 0.5239961174821153, + "loss/policy_avg": 0.0998329371213913, + "lr": 2.6508819018404907e-06, + "objective/entropy": 48.56463623046875, + "objective/kl": 16.415325164794922, + "objective/non_score_reward": -1.64153254032135, + "objective/rlhf_reward": -6.166130459308624, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.19106674194336, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7929558753967285, + "step": 1821, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998055696487427 + }, + { + "episode": 29168, + "epoch": 0.5242837113995039, + "loss/policy_avg": 1.2181178331375122, + "lr": 2.65069018404908e-06, + "objective/entropy": 29.835037231445312, + "objective/kl": 14.585583686828613, + "objective/non_score_reward": -1.4585583209991455, + "objective/rlhf_reward": -1.4342335820198056, + "objective/scores": 1.1, + "policy/approxkl_avg": 46.434059143066406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6647939682006836, + "step": 1822, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9998629093170166 + }, + { + "episode": 29184, + "epoch": 0.5245713053168926, + "loss/policy_avg": -0.3791189193725586, + "lr": 2.6504984662576687e-06, + "objective/entropy": -51.36210250854492, + "objective/kl": 11.42615795135498, + "objective/non_score_reward": -1.1426159143447876, + "objective/rlhf_reward": -1.6467445834886758, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.951854705810547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4953696131706238, + "step": 1823, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003664970397949 + }, + { + "episode": 29200, + "epoch": 0.5248588992342812, + "loss/policy_avg": 0.2820661962032318, + "lr": 2.650306748466258e-06, + "objective/entropy": 51.438140869140625, + "objective/kl": 12.149774551391602, + "objective/non_score_reward": -1.214977502822876, + "objective/rlhf_reward": -4.4599096983671185, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.368728637695312, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.477517306804657, + "step": 1824, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998262643814087 + }, + { + "episode": 29216, + "epoch": 0.5251464931516698, + "loss/policy_avg": -0.019476689398288727, + "lr": 2.6501150306748468e-06, + "objective/entropy": -204.80804443359375, + "objective/kl": 12.380779266357422, + "objective/non_score_reward": -1.238077998161316, + "objective/rlhf_reward": -0.5523121416568753, + "objective/scores": 1.1, + "policy/approxkl_avg": 85.80036163330078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.704627275466919, + "step": 1825, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0015084743499756 + }, + { + "episode": 29232, + "epoch": 0.5254340870690585, + "loss/policy_avg": -0.14664429426193237, + "lr": 2.6499233128834356e-06, + "objective/entropy": 72.73933410644531, + "objective/kl": 15.728586196899414, + "objective/non_score_reward": -1.5728585720062256, + "objective/rlhf_reward": -8.291434288024902, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.642614364624023, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8739404082298279, + "step": 1826, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0036849975585938 + }, + { + "episode": 29248, + "epoch": 0.5257216809864471, + "loss/policy_avg": 0.25007522106170654, + "lr": 2.649731595092025e-06, + "objective/entropy": -111.04607391357422, + "objective/kl": 9.555280685424805, + "objective/non_score_reward": -0.9555281400680542, + "objective/rlhf_reward": -1.422112500667572, + "objective/scores": 0.6, + "policy/approxkl_avg": 7.105292320251465, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6159435510635376, + "step": 1827, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995931386947632 + }, + { + "episode": 29264, + "epoch": 0.5260092749038358, + "loss/policy_avg": 0.14585937559604645, + "lr": 2.6495398773006136e-06, + "objective/entropy": 27.570262908935547, + "objective/kl": 17.47567367553711, + "objective/non_score_reward": -1.7475672960281372, + "objective/rlhf_reward": -8.99026870727539, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.218692779541016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5340030193328857, + "step": 1828, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987754821777344 + }, + { + "episode": 29280, + "epoch": 0.5262968688212244, + "loss/policy_avg": 0.6246511936187744, + "lr": 2.6493481595092024e-06, + "objective/entropy": -51.908145904541016, + "objective/kl": 16.57758140563965, + "objective/non_score_reward": -1.6577579975128174, + "objective/rlhf_reward": -8.63103199005127, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.057701110839844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5691357254981995, + "step": 1829, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991605281829834 + }, + { + "episode": 29296, + "epoch": 0.526584462738613, + "loss/policy_avg": 0.12075556814670563, + "lr": 2.6491564417177916e-06, + "objective/entropy": 76.26769256591797, + "objective/kl": 14.916351318359375, + "objective/non_score_reward": -1.4916352033615112, + "objective/rlhf_reward": -5.566540813446045, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.57536506652832, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6550225019454956, + "step": 1830, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0020956993103027 + }, + { + "episode": 29312, + "epoch": 0.5268720566560017, + "loss/policy_avg": 0.2506754696369171, + "lr": 2.6489647239263805e-06, + "objective/entropy": -35.4891242980957, + "objective/kl": 14.797635078430176, + "objective/non_score_reward": -1.4797635078430176, + "objective/rlhf_reward": -5.519054269790649, + "objective/scores": 0.1, + "policy/approxkl_avg": 36.26906204223633, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7706562280654907, + "step": 1831, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9965496063232422 + }, + { + "episode": 29328, + "epoch": 0.5271596505733904, + "loss/policy_avg": 0.1543758064508438, + "lr": 2.6487730061349697e-06, + "objective/entropy": 67.9344482421875, + "objective/kl": 13.906883239746094, + "objective/non_score_reward": -1.390688419342041, + "objective/rlhf_reward": -1.1627536773681637, + "objective/scores": 1.1, + "policy/approxkl_avg": 70.80870056152344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6718401908874512, + "step": 1832, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979827404022217 + }, + { + "episode": 29344, + "epoch": 0.5274472444907791, + "loss/policy_avg": 0.05845007672905922, + "lr": 2.6485812883435585e-06, + "objective/entropy": -174.62997436523438, + "objective/kl": 13.782123565673828, + "objective/non_score_reward": -1.3782124519348145, + "objective/rlhf_reward": -2.5891307338487834, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 21.229196548461914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8168323040008545, + "step": 1833, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.9980207681655884 + }, + { + "episode": 29360, + "epoch": 0.5277348384081677, + "loss/policy_avg": 0.12811380624771118, + "lr": 2.6483895705521473e-06, + "objective/entropy": -122.46810913085938, + "objective/kl": 18.168519973754883, + "objective/non_score_reward": -1.8168519735336304, + "objective/rlhf_reward": -6.867408013343811, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.205116271972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6772712469100952, + "step": 1834, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982894659042358 + }, + { + "episode": 29376, + "epoch": 0.5280224323255563, + "loss/policy_avg": 0.10961230099201202, + "lr": 2.648197852760736e-06, + "objective/entropy": 32.48823547363281, + "objective/kl": 21.051025390625, + "objective/non_score_reward": -2.1051025390625, + "objective/rlhf_reward": -10.42041015625, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.921422481536865, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8856111168861389, + "step": 1835, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003602504730225 + }, + { + "episode": 29392, + "epoch": 0.528310026242945, + "loss/policy_avg": 0.2646952271461487, + "lr": 2.648006134969325e-06, + "objective/entropy": 106.65617370605469, + "objective/kl": 18.93328857421875, + "objective/non_score_reward": -1.8933287858963013, + "objective/rlhf_reward": -4.649596397520277, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 219.3662567138672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4539373517036438, + "step": 1836, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997753620147705 + }, + { + "episode": 29408, + "epoch": 0.5285976201603336, + "loss/policy_avg": 0.18501770496368408, + "lr": 2.647814417177914e-06, + "objective/entropy": -203.39031982421875, + "objective/kl": 13.033735275268555, + "objective/non_score_reward": -1.3033735752105713, + "objective/rlhf_reward": -4.813494479656219, + "objective/scores": 0.1, + "policy/approxkl_avg": 58.33476638793945, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5079236030578613, + "step": 1837, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998493194580078 + }, + { + "episode": 29424, + "epoch": 0.5288852140777223, + "loss/policy_avg": 0.2593972384929657, + "lr": 2.647622699386503e-06, + "objective/entropy": 169.56747436523438, + "objective/kl": 20.271284103393555, + "objective/non_score_reward": -2.0271284580230713, + "objective/rlhf_reward": -10.108513832092285, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.37139129638672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6737958192825317, + "step": 1838, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973093271255493 + }, + { + "episode": 29440, + "epoch": 0.5291728079951109, + "loss/policy_avg": 0.15093323588371277, + "lr": 2.647430981595092e-06, + "objective/entropy": 173.2559814453125, + "objective/kl": 12.594919204711914, + "objective/non_score_reward": -1.2594919204711914, + "objective/rlhf_reward": -4.637967845797538, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.31730842590332, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.664380669593811, + "step": 1839, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000157594680786 + }, + { + "episode": 29456, + "epoch": 0.5294604019124995, + "loss/policy_avg": 0.09447266906499863, + "lr": 2.647239263803681e-06, + "objective/entropy": 161.65419006347656, + "objective/kl": 8.186479568481445, + "objective/non_score_reward": -0.8186479806900024, + "objective/rlhf_reward": -2.874591892957687, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.823326110839844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8956143856048584, + "step": 1840, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996443748474121 + }, + { + "episode": 29472, + "epoch": 0.5297479958298882, + "loss/policy_avg": 0.24017485976219177, + "lr": 2.64704754601227e-06, + "objective/entropy": 194.16900634765625, + "objective/kl": 16.956283569335938, + "objective/non_score_reward": -1.6956284046173096, + "objective/rlhf_reward": -8.782513618469238, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.098064422607422, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7093454599380493, + "step": 1841, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990488290786743 + }, + { + "episode": 29488, + "epoch": 0.5300355897472768, + "loss/policy_avg": 0.23009948432445526, + "lr": 2.646855828220859e-06, + "objective/entropy": 191.48776245117188, + "objective/kl": 8.397547721862793, + "objective/non_score_reward": -0.8397548198699951, + "objective/rlhf_reward": -2.9590191602706906, + "objective/scores": 0.1, + "policy/approxkl_avg": 93.32293701171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46769386529922485, + "step": 1842, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982507228851318 + }, + { + "episode": 29504, + "epoch": 0.5303231836646655, + "loss/policy_avg": 0.31807905435562134, + "lr": 2.646664110429448e-06, + "objective/entropy": 190.9818115234375, + "objective/kl": 17.607507705688477, + "objective/non_score_reward": -1.7607507705688477, + "objective/rlhf_reward": -2.6430031716823574, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.155506134033203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6137632727622986, + "step": 1843, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971423149108887 + }, + { + "episode": 29520, + "epoch": 0.5306107775820541, + "loss/policy_avg": 0.5301668643951416, + "lr": 2.6464723926380367e-06, + "objective/entropy": 81.72561645507812, + "objective/kl": 16.590045928955078, + "objective/non_score_reward": -1.659004807472229, + "objective/rlhf_reward": -6.236018991470337, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.18244743347168, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8008009195327759, + "step": 1844, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989930391311646 + }, + { + "episode": 29536, + "epoch": 0.5308983714994427, + "loss/policy_avg": -0.15928786993026733, + "lr": 2.646280674846626e-06, + "objective/entropy": 124.23045349121094, + "objective/kl": 16.684246063232422, + "objective/non_score_reward": -1.6684244871139526, + "objective/rlhf_reward": -4.726286898331578, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 19.319072723388672, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7122434377670288, + "step": 1845, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003610134124756 + }, + { + "episode": 29552, + "epoch": 0.5311859654168314, + "loss/policy_avg": 0.3252880573272705, + "lr": 2.6460889570552147e-06, + "objective/entropy": 95.69757843017578, + "objective/kl": 17.60025405883789, + "objective/non_score_reward": -1.7600253820419312, + "objective/rlhf_reward": -5.092690105514462, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 82.03265380859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.771241307258606, + "step": 1846, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977837800979614 + }, + { + "episode": 29568, + "epoch": 0.5314735593342201, + "loss/policy_avg": 0.23750726878643036, + "lr": 2.645897239263804e-06, + "objective/entropy": 186.9475555419922, + "objective/kl": 13.092645645141602, + "objective/non_score_reward": -1.3092646598815918, + "objective/rlhf_reward": -4.837058520317077, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.29829216003418, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6885484457015991, + "step": 1847, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0034914016723633 + }, + { + "episode": 29584, + "epoch": 0.5317611532516088, + "loss/policy_avg": 0.2734951376914978, + "lr": 2.6457055214723928e-06, + "objective/entropy": -124.91612243652344, + "objective/kl": 15.988296508789062, + "objective/non_score_reward": -1.5988296270370483, + "objective/rlhf_reward": -8.395318031311035, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.176590442657471, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7133872509002686, + "step": 1848, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998915672302246 + }, + { + "episode": 29600, + "epoch": 0.5320487471689974, + "loss/policy_avg": 0.6326997876167297, + "lr": 2.6455138036809816e-06, + "objective/entropy": 102.43971252441406, + "objective/kl": 14.527070999145508, + "objective/non_score_reward": -1.452707052230835, + "objective/rlhf_reward": -3.863417278008397, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 13.749048233032227, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6314961910247803, + "step": 1849, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9977028369903564 + }, + { + "episode": 29616, + "epoch": 0.5323363410863861, + "loss/policy_avg": 0.018764328211545944, + "lr": 2.645322085889571e-06, + "objective/entropy": 173.0929718017578, + "objective/kl": 13.599279403686523, + "objective/non_score_reward": -1.359928011894226, + "objective/rlhf_reward": -7.439712047576904, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.08881187438965, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6948906779289246, + "step": 1850, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987165927886963 + }, + { + "episode": 29632, + "epoch": 0.5326239350037747, + "loss/policy_avg": 0.8213224411010742, + "lr": 2.6451303680981596e-06, + "objective/entropy": 84.24604797363281, + "objective/kl": 19.240615844726562, + "objective/non_score_reward": -1.92406165599823, + "objective/rlhf_reward": -6.092126611534672, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 32.59326171875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7166078090667725, + "step": 1851, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984686374664307 + }, + { + "episode": 29648, + "epoch": 0.5329115289211633, + "loss/policy_avg": 0.20839475095272064, + "lr": 2.644938650306749e-06, + "objective/entropy": 143.82113647460938, + "objective/kl": 16.232925415039062, + "objective/non_score_reward": -1.6232926845550537, + "objective/rlhf_reward": -2.093170380592346, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.679071426391602, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7232511043548584, + "step": 1852, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998900890350342 + }, + { + "episode": 29664, + "epoch": 0.533199122838552, + "loss/policy_avg": 0.07763684540987015, + "lr": 2.6447469325153377e-06, + "objective/entropy": -36.77897262573242, + "objective/kl": 16.796016693115234, + "objective/non_score_reward": -1.6796014308929443, + "objective/rlhf_reward": -6.318406081199646, + "objective/scores": 0.1, + "policy/approxkl_avg": 42.441017150878906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6772186756134033, + "step": 1853, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989824295043945 + }, + { + "episode": 29680, + "epoch": 0.5334867167559406, + "loss/policy_avg": 0.025462936609983444, + "lr": 2.6445552147239265e-06, + "objective/entropy": 132.22561645507812, + "objective/kl": 9.946213722229004, + "objective/non_score_reward": -0.9946214556694031, + "objective/rlhf_reward": -3.5784857928752896, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.1699419021606445, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5213096737861633, + "step": 1854, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997193813323975 + }, + { + "episode": 29696, + "epoch": 0.5337743106733293, + "loss/policy_avg": 0.8330253958702087, + "lr": 2.6443634969325157e-06, + "objective/entropy": 294.1936950683594, + "objective/kl": 16.371532440185547, + "objective/non_score_reward": -1.6371533870697021, + "objective/rlhf_reward": -8.548613548278809, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.149263381958008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9009605050086975, + "step": 1855, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9972525835037231 + }, + { + "episode": 29712, + "epoch": 0.5340619045907179, + "loss/policy_avg": 0.07781802117824554, + "lr": 2.644171779141104e-06, + "objective/entropy": 165.48971557617188, + "objective/kl": 18.383617401123047, + "objective/non_score_reward": -1.8383618593215942, + "objective/rlhf_reward": -6.953447675704956, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.224011421203613, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9337248802185059, + "step": 1856, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977425336837769 + }, + { + "episode": 29728, + "epoch": 0.5343494985081065, + "loss/policy_avg": 0.3984661102294922, + "lr": 2.6439800613496933e-06, + "objective/entropy": 0.33826446533203125, + "objective/kl": 11.344948768615723, + "objective/non_score_reward": -1.1344949007034302, + "objective/rlhf_reward": -2.4152732513108592, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 17.914337158203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7256457805633545, + "step": 1857, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982662200927734 + }, + { + "episode": 29744, + "epoch": 0.5346370924254952, + "loss/policy_avg": 0.19412755966186523, + "lr": 2.643788343558282e-06, + "objective/entropy": 99.91259765625, + "objective/kl": 22.232093811035156, + "objective/non_score_reward": -2.2232093811035156, + "objective/rlhf_reward": -10.892837524414062, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.14105987548828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7228943109512329, + "step": 1858, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999452829360962 + }, + { + "episode": 29760, + "epoch": 0.5349246863428838, + "loss/policy_avg": 0.44888734817504883, + "lr": 2.643596625766871e-06, + "objective/entropy": 206.21575927734375, + "objective/kl": 17.497486114501953, + "objective/non_score_reward": -1.749748706817627, + "objective/rlhf_reward": -4.876288416163002, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 157.12306213378906, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.598418116569519, + "step": 1859, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997058629989624 + }, + { + "episode": 29776, + "epoch": 0.5352122802602725, + "loss/policy_avg": 0.11433567106723785, + "lr": 2.64340490797546e-06, + "objective/entropy": -88.20219421386719, + "objective/kl": 11.040931701660156, + "objective/non_score_reward": -1.104093074798584, + "objective/rlhf_reward": -2.4689614725875213, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 9.95333480834961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6028738021850586, + "step": 1860, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000257730484009 + }, + { + "episode": 29792, + "epoch": 0.5354998741776611, + "loss/policy_avg": 0.45190608501434326, + "lr": 2.643213190184049e-06, + "objective/entropy": 223.72943115234375, + "objective/kl": 17.677797317504883, + "objective/non_score_reward": -1.76777982711792, + "objective/rlhf_reward": -4.14740041339514, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 76.53868103027344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5580284595489502, + "step": 1861, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989244937896729 + }, + { + "episode": 29808, + "epoch": 0.5357874680950497, + "loss/policy_avg": 0.5275196433067322, + "lr": 2.643021472392638e-06, + "objective/entropy": -49.960601806640625, + "objective/kl": 15.230354309082031, + "objective/non_score_reward": -1.5230354070663452, + "objective/rlhf_reward": -8.092142105102539, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.79082489013672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7794216275215149, + "step": 1862, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985644817352295 + }, + { + "episode": 29824, + "epoch": 0.5360750620124385, + "loss/policy_avg": 1.3458707332611084, + "lr": 2.642829754601227e-06, + "objective/entropy": -221.41986083984375, + "objective/kl": 9.127204895019531, + "objective/non_score_reward": -0.9127205014228821, + "objective/rlhf_reward": -1.7034706575440723, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 18.279823303222656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.560336172580719, + "step": 1863, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9984030723571777 + }, + { + "episode": 29840, + "epoch": 0.5363626559298271, + "loss/policy_avg": 0.23017987608909607, + "lr": 2.642638036809816e-06, + "objective/entropy": 99.97102355957031, + "objective/kl": 18.287355422973633, + "objective/non_score_reward": -1.8287353515625, + "objective/rlhf_reward": -9.31494140625, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.416362762451172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.539484977722168, + "step": 1864, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988839626312256 + }, + { + "episode": 29856, + "epoch": 0.5366502498472158, + "loss/policy_avg": 0.11260244250297546, + "lr": 2.642446319018405e-06, + "objective/entropy": 68.40955352783203, + "objective/kl": 17.344467163085938, + "objective/non_score_reward": -1.7344467639923096, + "objective/rlhf_reward": -2.537786996364593, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.906153678894043, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6735920906066895, + "step": 1865, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000178813934326 + }, + { + "episode": 29872, + "epoch": 0.5369378437646044, + "loss/policy_avg": 0.5199981927871704, + "lr": 2.642254601226994e-06, + "objective/entropy": 223.3328857421875, + "objective/kl": 10.768610000610352, + "objective/non_score_reward": -1.076861023902893, + "objective/rlhf_reward": -6.307444095611572, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.59469223022461, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5734953880310059, + "step": 1866, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990028142929077 + }, + { + "episode": 29888, + "epoch": 0.537225437681993, + "loss/policy_avg": 0.10744701325893402, + "lr": 2.6420628834355827e-06, + "objective/entropy": 63.99552536010742, + "objective/kl": 14.580144882202148, + "objective/non_score_reward": -1.4580143690109253, + "objective/rlhf_reward": -5.43205771446228, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.58818817138672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6608311533927917, + "step": 1867, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967219829559326 + }, + { + "episode": 29904, + "epoch": 0.5375130315993817, + "loss/policy_avg": 0.44945427775382996, + "lr": 2.641871165644172e-06, + "objective/entropy": -281.01654052734375, + "objective/kl": 14.940108299255371, + "objective/non_score_reward": -1.4940109252929688, + "objective/rlhf_reward": -3.5760435372591015, + "objective/scores": 0.6, + "policy/approxkl_avg": 74.54122924804688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5839606523513794, + "step": 1868, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974722862243652 + }, + { + "episode": 29920, + "epoch": 0.5378006255167703, + "loss/policy_avg": -0.037052758038043976, + "lr": 2.6416794478527607e-06, + "objective/entropy": 179.74900817871094, + "objective/kl": 15.392721176147461, + "objective/non_score_reward": -1.5392720699310303, + "objective/rlhf_reward": -3.2333691015255184, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 66.53801727294922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6649525165557861, + "step": 1869, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002619743347168 + }, + { + "episode": 29936, + "epoch": 0.538088219434159, + "loss/policy_avg": 0.15332826972007751, + "lr": 2.64148773006135e-06, + "objective/entropy": 219.11968994140625, + "objective/kl": 11.19294261932373, + "objective/non_score_reward": -1.119294285774231, + "objective/rlhf_reward": -1.553457964898321, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 17.272003173828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5493988990783691, + "step": 1870, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000128746032715 + }, + { + "episode": 29952, + "epoch": 0.5383758133515476, + "loss/policy_avg": 0.3887026011943817, + "lr": 2.6412960122699388e-06, + "objective/entropy": 327.54498291015625, + "objective/kl": 15.907135963439941, + "objective/non_score_reward": -1.5907135009765625, + "objective/rlhf_reward": -8.36285400390625, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.208090782165527, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 1.0575509071350098, + "step": 1871, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971492290496826 + }, + { + "episode": 29968, + "epoch": 0.5386634072689362, + "loss/policy_avg": 0.703474760055542, + "lr": 2.6411042944785276e-06, + "objective/entropy": 128.798095703125, + "objective/kl": 14.402985572814941, + "objective/non_score_reward": -1.4402985572814941, + "objective/rlhf_reward": -3.6384882948556285, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 14.528717041015625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6321216225624084, + "step": 1872, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003087520599365 + }, + { + "episode": 29984, + "epoch": 0.5389510011863249, + "loss/policy_avg": 0.5478439331054688, + "lr": 2.640912576687117e-06, + "objective/entropy": 42.24315643310547, + "objective/kl": 17.929065704345703, + "objective/non_score_reward": -1.7929065227508545, + "objective/rlhf_reward": -9.171626091003418, + "objective/scores": -0.5, + "policy/approxkl_avg": 78.95010375976562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6490499973297119, + "step": 1873, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995921850204468 + }, + { + "episode": 30000, + "epoch": 0.5392385951037135, + "loss/policy_avg": 0.30609291791915894, + "lr": 2.6407208588957056e-06, + "objective/entropy": 126.87837982177734, + "objective/kl": 12.08336067199707, + "objective/non_score_reward": -1.2083361148834229, + "objective/rlhf_reward": -4.433344638347625, + "objective/scores": 0.1, + "policy/approxkl_avg": 78.1723403930664, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4774414300918579, + "step": 1874, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979808330535889 + }, + { + "episode": 30016, + "epoch": 0.5395261890211022, + "loss/policy_avg": 0.1784551441669464, + "lr": 2.640529141104295e-06, + "objective/entropy": -67.91968536376953, + "objective/kl": 12.234102249145508, + "objective/non_score_reward": -1.2234103679656982, + "objective/rlhf_reward": -3.068812485012125, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 7.247363567352295, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6835782527923584, + "step": 1875, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000981330871582 + }, + { + "episode": 30032, + "epoch": 0.5398137829384908, + "loss/policy_avg": 0.07866216450929642, + "lr": 2.6403374233128837e-06, + "objective/entropy": 31.470932006835938, + "objective/kl": 18.203540802001953, + "objective/non_score_reward": -1.8203542232513428, + "objective/rlhf_reward": -6.881416893005371, + "objective/scores": 0.1, + "policy/approxkl_avg": 223.04364013671875, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6657415628433228, + "step": 1876, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984439611434937 + }, + { + "episode": 30048, + "epoch": 0.5401013768558794, + "loss/policy_avg": 0.3167308568954468, + "lr": 2.6401457055214725e-06, + "objective/entropy": 320.1375732421875, + "objective/kl": 18.73411750793457, + "objective/non_score_reward": -1.8734118938446045, + "objective/rlhf_reward": -3.093647754192352, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.82015228271484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.857011079788208, + "step": 1877, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9953978061676025 + }, + { + "episode": 30064, + "epoch": 0.5403889707732682, + "loss/policy_avg": 0.21011970937252045, + "lr": 2.6399539877300617e-06, + "objective/entropy": 133.35537719726562, + "objective/kl": 14.358227729797363, + "objective/non_score_reward": -1.4358227252960205, + "objective/rlhf_reward": -5.343290990591049, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.1520862579345703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5805197954177856, + "step": 1878, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999664545059204 + }, + { + "episode": 30080, + "epoch": 0.5406765646906568, + "loss/policy_avg": 0.21725696325302124, + "lr": 2.63976226993865e-06, + "objective/entropy": 126.29757690429688, + "objective/kl": 13.66500473022461, + "objective/non_score_reward": -1.3665004968643188, + "objective/rlhf_reward": -3.5185906989144637, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 18.468700408935547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.32678791880607605, + "step": 1879, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992204904556274 + }, + { + "episode": 30096, + "epoch": 0.5409641586080455, + "loss/policy_avg": 0.3724169135093689, + "lr": 2.6395705521472393e-06, + "objective/entropy": 23.84024429321289, + "objective/kl": 14.622039794921875, + "objective/non_score_reward": -1.4622039794921875, + "objective/rlhf_reward": -7.84881591796875, + "objective/scores": -0.5, + "policy/approxkl_avg": 58.616756439208984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6998543739318848, + "step": 1880, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997481107711792 + }, + { + "episode": 30112, + "epoch": 0.5412517525254341, + "loss/policy_avg": 0.09946730732917786, + "lr": 2.639378834355828e-06, + "objective/entropy": 73.49078369140625, + "objective/kl": 20.70635986328125, + "objective/non_score_reward": -2.0706357955932617, + "objective/rlhf_reward": -7.882543003559112, + "objective/scores": 0.1, + "policy/approxkl_avg": 55.02135467529297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6006935834884644, + "step": 1881, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999374508857727 + }, + { + "episode": 30128, + "epoch": 0.5415393464428228, + "loss/policy_avg": 0.25386878848075867, + "lr": 2.639187116564417e-06, + "objective/entropy": 15.929668426513672, + "objective/kl": 19.90158462524414, + "objective/non_score_reward": -1.9901586771011353, + "objective/rlhf_reward": -5.560634529590606, + "objective/scores": 0.6, + "policy/approxkl_avg": 44.23625183105469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6773188710212708, + "step": 1882, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970426559448242 + }, + { + "episode": 30144, + "epoch": 0.5418269403602114, + "loss/policy_avg": 2.0969796180725098, + "lr": 2.638995398773006e-06, + "objective/entropy": 52.69280242919922, + "objective/kl": 18.107351303100586, + "objective/non_score_reward": -1.8107349872589111, + "objective/rlhf_reward": -6.84293989688158, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.367340087890625, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7185496091842651, + "step": 1883, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999009370803833 + }, + { + "episode": 30160, + "epoch": 0.5421145342776, + "loss/policy_avg": -0.183127298951149, + "lr": 2.638803680981595e-06, + "objective/entropy": -332.404296875, + "objective/kl": 13.645541191101074, + "objective/non_score_reward": -1.3645542860031128, + "objective/rlhf_reward": -3.796357398450957, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 1.9626456499099731, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7224613428115845, + "step": 1884, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0001583099365234 + }, + { + "episode": 30176, + "epoch": 0.5424021281949887, + "loss/policy_avg": 0.23557086288928986, + "lr": 2.6386119631901842e-06, + "objective/entropy": 119.83067321777344, + "objective/kl": 15.023857116699219, + "objective/non_score_reward": -1.5023858547210693, + "objective/rlhf_reward": -1.6095432996749874, + "objective/scores": 1.1, + "policy/approxkl_avg": 49.335533142089844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6670380234718323, + "step": 1885, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991014003753662 + }, + { + "episode": 30192, + "epoch": 0.5426897221123773, + "loss/policy_avg": -0.24799206852912903, + "lr": 2.638420245398773e-06, + "objective/entropy": -135.42721557617188, + "objective/kl": 16.508018493652344, + "objective/non_score_reward": -1.6508018970489502, + "objective/rlhf_reward": -6.203207468986511, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.26435852050781, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4349724054336548, + "step": 1886, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996954321861267 + }, + { + "episode": 30208, + "epoch": 0.542977316029766, + "loss/policy_avg": 0.2918689250946045, + "lr": 2.638228527607362e-06, + "objective/entropy": 101.60655212402344, + "objective/kl": 18.337299346923828, + "objective/non_score_reward": -1.8337297439575195, + "objective/rlhf_reward": -5.673059051454651, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 104.64295196533203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4675699472427368, + "step": 1887, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995253086090088 + }, + { + "episode": 30224, + "epoch": 0.5432649099471546, + "loss/policy_avg": 0.018769599497318268, + "lr": 2.638036809815951e-06, + "objective/entropy": 24.645986557006836, + "objective/kl": 19.31072235107422, + "objective/non_score_reward": -1.9310722351074219, + "objective/rlhf_reward": -7.324289298057556, + "objective/scores": 0.1, + "policy/approxkl_avg": 69.67178344726562, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.47627758979797363, + "step": 1888, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013792514801025 + }, + { + "episode": 30240, + "epoch": 0.5435525038645432, + "loss/policy_avg": 0.15706397593021393, + "lr": 2.63784509202454e-06, + "objective/entropy": -278.9888916015625, + "objective/kl": 1.9181170463562012, + "objective/non_score_reward": -0.19181174039840698, + "objective/rlhf_reward": 2.1564720824945245, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 16.385345458984375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6495269536972046, + "step": 1889, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 10, + "val/ratio": 1.99857497215271 + }, + { + "episode": 30256, + "epoch": 0.5438400977819319, + "loss/policy_avg": -0.10978720337152481, + "lr": 2.637653374233129e-06, + "objective/entropy": 221.46072387695312, + "objective/kl": 18.129825592041016, + "objective/non_score_reward": -1.8129827976226807, + "objective/rlhf_reward": -6.851930877566337, + "objective/scores": 0.1, + "policy/approxkl_avg": 98.64704895019531, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5976404547691345, + "step": 1890, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0071773529052734 + }, + { + "episode": 30272, + "epoch": 0.5441276916993205, + "loss/policy_avg": 0.5263071060180664, + "lr": 2.637461656441718e-06, + "objective/entropy": -24.288192749023438, + "objective/kl": 13.550774574279785, + "objective/non_score_reward": -1.3550773859024048, + "objective/rlhf_reward": -2.496590469719145, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.168991088867188, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7016432285308838, + "step": 1891, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9976330995559692 + }, + { + "episode": 30288, + "epoch": 0.5444152856167092, + "loss/policy_avg": 0.14902125298976898, + "lr": 2.6372699386503067e-06, + "objective/entropy": 19.211254119873047, + "objective/kl": 12.499555587768555, + "objective/non_score_reward": -1.249955654144287, + "objective/rlhf_reward": -4.599822735786438, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.2693653106689453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4941726326942444, + "step": 1892, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993488788604736 + }, + { + "episode": 30304, + "epoch": 0.5447028795340979, + "loss/policy_avg": 1.661439061164856, + "lr": 2.637078220858896e-06, + "objective/entropy": -65.19476318359375, + "objective/kl": 14.480757713317871, + "objective/non_score_reward": -1.4480756521224976, + "objective/rlhf_reward": -3.3923026382923123, + "objective/scores": 0.6, + "policy/approxkl_avg": 107.93418884277344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6490131616592407, + "step": 1893, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9979932308197021 + }, + { + "episode": 30320, + "epoch": 0.5449904734514865, + "loss/policy_avg": 0.3524066209793091, + "lr": 2.6368865030674848e-06, + "objective/entropy": -170.36708068847656, + "objective/kl": 16.79137420654297, + "objective/non_score_reward": -1.6791372299194336, + "objective/rlhf_reward": -6.316548681259155, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.46489715576172, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.507827877998352, + "step": 1894, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9996378421783447 + }, + { + "episode": 30336, + "epoch": 0.5452780673688752, + "loss/policy_avg": 0.3015563488006592, + "lr": 2.6366947852760736e-06, + "objective/entropy": 7.907737731933594, + "objective/kl": 11.733333587646484, + "objective/non_score_reward": -1.1733335256576538, + "objective/rlhf_reward": -4.293333983421325, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.35322952270508, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6082165241241455, + "step": 1895, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987642765045166 + }, + { + "episode": 30352, + "epoch": 0.5455656612862638, + "loss/policy_avg": -0.0767693892121315, + "lr": 2.636503067484663e-06, + "objective/entropy": 42.51039505004883, + "objective/kl": 14.21924114227295, + "objective/non_score_reward": -1.421924114227295, + "objective/rlhf_reward": -2.7639773234140605, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.097938537597656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4579886794090271, + "step": 1896, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000124931335449 + }, + { + "episode": 30368, + "epoch": 0.5458532552036525, + "loss/policy_avg": 0.009962936863303185, + "lr": 2.6363113496932516e-06, + "objective/entropy": 119.24744415283203, + "objective/kl": 6.005527496337891, + "objective/non_score_reward": -0.6005527973175049, + "objective/rlhf_reward": 1.997788900136948, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.5343382358551025, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5637143850326538, + "step": 1897, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001119613647461 + }, + { + "episode": 30384, + "epoch": 0.5461408491210411, + "loss/policy_avg": 0.3311734199523926, + "lr": 2.636119631901841e-06, + "objective/entropy": 139.10549926757812, + "objective/kl": 11.098630905151367, + "objective/non_score_reward": -1.109863042831421, + "objective/rlhf_reward": -0.03945221602916682, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.385029792785645, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.596035361289978, + "step": 1898, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0025429725646973 + }, + { + "episode": 30400, + "epoch": 0.5464284430384297, + "loss/policy_avg": 0.14937268197536469, + "lr": 2.6359279141104297e-06, + "objective/entropy": -126.49579620361328, + "objective/kl": 13.532024383544922, + "objective/non_score_reward": -1.3532025814056396, + "objective/rlhf_reward": -1.0128101207315918, + "objective/scores": 1.1, + "policy/approxkl_avg": 56.03593444824219, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5827751159667969, + "step": 1899, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989010095596313 + }, + { + "episode": 30416, + "epoch": 0.5467160369558184, + "loss/policy_avg": 0.2829972207546234, + "lr": 2.6357361963190185e-06, + "objective/entropy": 216.04718017578125, + "objective/kl": 13.735719680786133, + "objective/non_score_reward": -1.3735718727111816, + "objective/rlhf_reward": -3.546876351313527, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 50.316139221191406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5368937253952026, + "step": 1900, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997572660446167 + }, + { + "episode": 30432, + "epoch": 0.547003630873207, + "loss/policy_avg": 0.17570659518241882, + "lr": 2.6355444785276073e-06, + "objective/entropy": 354.5912780761719, + "objective/kl": 16.02384376525879, + "objective/non_score_reward": -1.6023844480514526, + "objective/rlhf_reward": -6.009537851810455, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.36066246032715, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8833104372024536, + "step": 1901, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990440607070923 + }, + { + "episode": 30448, + "epoch": 0.5472912247905957, + "loss/policy_avg": 0.4607427418231964, + "lr": 2.635352760736196e-06, + "objective/entropy": 20.947402954101562, + "objective/kl": 14.86680793762207, + "objective/non_score_reward": -1.4866809844970703, + "objective/rlhf_reward": -3.5467239081859585, + "objective/scores": 0.6, + "policy/approxkl_avg": 58.469356536865234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6606134176254272, + "step": 1902, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991655349731445 + }, + { + "episode": 30464, + "epoch": 0.5475788187079843, + "loss/policy_avg": 0.5278885364532471, + "lr": 2.6351610429447853e-06, + "objective/entropy": 65.04945373535156, + "objective/kl": 13.691702842712402, + "objective/non_score_reward": -1.3691703081130981, + "objective/rlhf_reward": -5.076681351661682, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.13874816894531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5448737144470215, + "step": 1903, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996506690979004 + }, + { + "episode": 30480, + "epoch": 0.547866412625373, + "loss/policy_avg": 0.1716153770685196, + "lr": 2.634969325153374e-06, + "objective/entropy": 271.1617126464844, + "objective/kl": 19.076047897338867, + "objective/non_score_reward": -1.9076049327850342, + "objective/rlhf_reward": -9.630420684814453, + "objective/scores": -0.5, + "policy/approxkl_avg": 192.4165496826172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6272881031036377, + "step": 1904, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973095655441284 + }, + { + "episode": 30496, + "epoch": 0.5481540065427616, + "loss/policy_avg": 0.16718728840351105, + "lr": 2.634777607361963e-06, + "objective/entropy": 56.779449462890625, + "objective/kl": 16.912702560424805, + "objective/non_score_reward": -1.691270351409912, + "objective/rlhf_reward": -6.365081644058227, + "objective/scores": 0.1, + "policy/approxkl_avg": 58.04198455810547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6575337648391724, + "step": 1905, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.00191593170166 + }, + { + "episode": 30512, + "epoch": 0.5484416004601502, + "loss/policy_avg": 0.02517540752887726, + "lr": 2.634585889570552e-06, + "objective/entropy": -61.73188018798828, + "objective/kl": 8.17990493774414, + "objective/non_score_reward": -0.8179904222488403, + "objective/rlhf_reward": -0.3482427566659183, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 21.236021041870117, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5999727249145508, + "step": 1906, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999282956123352 + }, + { + "episode": 30528, + "epoch": 0.5487291943775389, + "loss/policy_avg": 0.11145927011966705, + "lr": 2.634394171779141e-06, + "objective/entropy": -360.6292419433594, + "objective/kl": 15.74618911743164, + "objective/non_score_reward": -1.5746190547943115, + "objective/rlhf_reward": -5.898476040363311, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.0206298828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8304791450500488, + "step": 1907, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9990675449371338 + }, + { + "episode": 30544, + "epoch": 0.5490167882949276, + "loss/policy_avg": 0.17270202934741974, + "lr": 2.6342024539877302e-06, + "objective/entropy": 256.72802734375, + "objective/kl": 20.78005599975586, + "objective/non_score_reward": -2.07800555229187, + "objective/rlhf_reward": -7.912022566795349, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.87220573425293, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6917515993118286, + "step": 1908, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00004506111145 + }, + { + "episode": 30560, + "epoch": 0.5493043822123163, + "loss/policy_avg": 0.13270515203475952, + "lr": 2.634010736196319e-06, + "objective/entropy": 116.8352279663086, + "objective/kl": 15.580759048461914, + "objective/non_score_reward": -1.5580757856369019, + "objective/rlhf_reward": -5.832303127646446, + "objective/scores": 0.1, + "policy/approxkl_avg": 53.574005126953125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6051105856895447, + "step": 1909, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993774890899658 + }, + { + "episode": 30576, + "epoch": 0.5495919761297049, + "loss/policy_avg": 0.5251754522323608, + "lr": 2.633819018404908e-06, + "objective/entropy": -125.42312622070312, + "objective/kl": 14.342058181762695, + "objective/non_score_reward": -1.4342057704925537, + "objective/rlhf_reward": -3.3368228435516354, + "objective/scores": 0.6, + "policy/approxkl_avg": 7.771772861480713, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5477792024612427, + "step": 1910, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0014166831970215 + }, + { + "episode": 30592, + "epoch": 0.5498795700470935, + "loss/policy_avg": 0.18452884256839752, + "lr": 2.633627300613497e-06, + "objective/entropy": 240.2740020751953, + "objective/kl": 12.670340538024902, + "objective/non_score_reward": -1.2670340538024902, + "objective/rlhf_reward": -4.668136155605316, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.453449010848999, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6076258420944214, + "step": 1911, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000457763671875 + }, + { + "episode": 30608, + "epoch": 0.5501671639644822, + "loss/policy_avg": 0.12677043676376343, + "lr": 2.633435582822086e-06, + "objective/entropy": 31.767406463623047, + "objective/kl": 11.432640075683594, + "objective/non_score_reward": -1.1432639360427856, + "objective/rlhf_reward": -6.573055744171143, + "objective/scores": -0.5, + "policy/approxkl_avg": 34.6868896484375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5203061699867249, + "step": 1912, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0009207725524902 + }, + { + "episode": 30624, + "epoch": 0.5504547578818708, + "loss/policy_avg": 0.47530901432037354, + "lr": 2.633243865030675e-06, + "objective/entropy": 224.4400634765625, + "objective/kl": 12.745110511779785, + "objective/non_score_reward": -1.2745109796524048, + "objective/rlhf_reward": -7.098043918609619, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.56214904785156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5925279855728149, + "step": 1913, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000298023223877 + }, + { + "episode": 30640, + "epoch": 0.5507423517992595, + "loss/policy_avg": 0.029194243252277374, + "lr": 2.633052147239264e-06, + "objective/entropy": 156.1571044921875, + "objective/kl": 8.93233871459961, + "objective/non_score_reward": -0.8932338953018188, + "objective/rlhf_reward": -3.172935730218887, + "objective/scores": 0.1, + "policy/approxkl_avg": 17.79572296142578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7106249332427979, + "step": 1914, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002284049987793 + }, + { + "episode": 30656, + "epoch": 0.5510299457166481, + "loss/policy_avg": 0.03485263139009476, + "lr": 2.6328604294478527e-06, + "objective/entropy": 230.6031036376953, + "objective/kl": 13.536725997924805, + "objective/non_score_reward": -1.3536725044250488, + "objective/rlhf_reward": -5.014690196514129, + "objective/scores": 0.1, + "policy/approxkl_avg": 56.59600067138672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5597454309463501, + "step": 1915, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974443912506104 + }, + { + "episode": 30672, + "epoch": 0.5513175396340367, + "loss/policy_avg": 0.2497161328792572, + "lr": 2.632668711656442e-06, + "objective/entropy": -96.27470397949219, + "objective/kl": 12.014034271240234, + "objective/non_score_reward": -1.2014034986495972, + "objective/rlhf_reward": -1.8818948611032693, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.185368776321411, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5934675931930542, + "step": 1916, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.004218101501465 + }, + { + "episode": 30688, + "epoch": 0.5516051335514254, + "loss/policy_avg": 0.36477264761924744, + "lr": 2.6324769938650308e-06, + "objective/entropy": 210.66531372070312, + "objective/kl": 12.38323974609375, + "objective/non_score_reward": -1.2383239269256592, + "objective/rlhf_reward": -6.953295707702637, + "objective/scores": -0.5, + "policy/approxkl_avg": 45.72761154174805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5291770696640015, + "step": 1917, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988799095153809 + }, + { + "episode": 30704, + "epoch": 0.551892727468814, + "loss/policy_avg": 0.4472461938858032, + "lr": 2.6322852760736196e-06, + "objective/entropy": 104.30414581298828, + "objective/kl": 13.77920913696289, + "objective/non_score_reward": -1.3779209852218628, + "objective/rlhf_reward": -5.111683821678161, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.586515426635742, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5660973787307739, + "step": 1918, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980164766311646 + }, + { + "episode": 30720, + "epoch": 0.5521803213862027, + "loss/policy_avg": 0.37102967500686646, + "lr": 2.632093558282209e-06, + "objective/entropy": 60.02656936645508, + "objective/kl": 10.058416366577148, + "objective/non_score_reward": -1.0058417320251465, + "objective/rlhf_reward": -6.023366928100586, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.679643630981445, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7811880111694336, + "step": 1919, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0008389949798584 + }, + { + "episode": 30736, + "epoch": 0.5524679153035913, + "loss/policy_avg": 0.21096105873584747, + "lr": 2.6319018404907976e-06, + "objective/entropy": -134.3823699951172, + "objective/kl": 13.197624206542969, + "objective/non_score_reward": -1.3197624683380127, + "objective/rlhf_reward": -2.355330799461576, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 45.305763244628906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.47559309005737305, + "step": 1920, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972378015518188 + }, + { + "episode": 30752, + "epoch": 0.5527555092209799, + "loss/policy_avg": 0.40054818987846375, + "lr": 2.631710122699387e-06, + "objective/entropy": 175.58895874023438, + "objective/kl": 17.43608856201172, + "objective/non_score_reward": -1.743609070777893, + "objective/rlhf_reward": -8.974435806274414, + "objective/scores": -0.5, + "policy/approxkl_avg": 59.900245666503906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8297168016433716, + "step": 1921, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998053789138794 + }, + { + "episode": 30768, + "epoch": 0.5530431031383686, + "loss/policy_avg": -0.3259451985359192, + "lr": 2.6315184049079757e-06, + "objective/entropy": -119.45405578613281, + "objective/kl": 16.492374420166016, + "objective/non_score_reward": -1.6492376327514648, + "objective/rlhf_reward": -8.59695053100586, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.74144744873047, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.831850528717041, + "step": 1922, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.998541235923767 + }, + { + "episode": 30784, + "epoch": 0.5533306970557573, + "loss/policy_avg": 1.059677243232727, + "lr": 2.6313266871165645e-06, + "objective/entropy": -17.18384552001953, + "objective/kl": 17.018753051757812, + "objective/non_score_reward": -1.7018752098083496, + "objective/rlhf_reward": -6.4075008690357205, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.392333984375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7476541996002197, + "step": 1923, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991600513458252 + }, + { + "episode": 30800, + "epoch": 0.553618290973146, + "loss/policy_avg": 0.5696789622306824, + "lr": 2.6311349693251533e-06, + "objective/entropy": 37.21489715576172, + "objective/kl": 17.65481948852539, + "objective/non_score_reward": -1.76548171043396, + "objective/rlhf_reward": -9.06192684173584, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.97014617919922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46043169498443604, + "step": 1924, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998187780380249 + }, + { + "episode": 30816, + "epoch": 0.5539058848905346, + "loss/policy_avg": 0.4576580822467804, + "lr": 2.630943251533742e-06, + "objective/entropy": 163.5697021484375, + "objective/kl": 19.395936965942383, + "objective/non_score_reward": -1.93959379196167, + "objective/rlhf_reward": -9.75837516784668, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.783203125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.633905827999115, + "step": 1925, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000224113464355 + }, + { + "episode": 30832, + "epoch": 0.5541934788079232, + "loss/policy_avg": 0.370788037776947, + "lr": 2.6307515337423313e-06, + "objective/entropy": -40.51841735839844, + "objective/kl": 12.110527038574219, + "objective/non_score_reward": -1.2110527753829956, + "objective/rlhf_reward": -2.8967998725938156, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 15.352380752563477, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6605029106140137, + "step": 1926, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997687816619873 + }, + { + "episode": 30848, + "epoch": 0.5544810727253119, + "loss/policy_avg": 0.5198642611503601, + "lr": 2.63055981595092e-06, + "objective/entropy": 123.91789245605469, + "objective/kl": 17.366943359375, + "objective/non_score_reward": -1.736694097518921, + "objective/rlhf_reward": -6.546776747703552, + "objective/scores": 0.1, + "policy/approxkl_avg": 54.66370391845703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7496399879455566, + "step": 1927, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992420673370361 + }, + { + "episode": 30864, + "epoch": 0.5547686666427005, + "loss/policy_avg": 0.46590036153793335, + "lr": 2.6303680981595094e-06, + "objective/entropy": 179.61434936523438, + "objective/kl": 12.078149795532227, + "objective/non_score_reward": -1.2078149318695068, + "objective/rlhf_reward": -2.708553614393745, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 37.074127197265625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6669144630432129, + "step": 1928, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998729228973389 + }, + { + "episode": 30880, + "epoch": 0.5550562605600892, + "loss/policy_avg": 0.23146501183509827, + "lr": 2.630176380368098e-06, + "objective/entropy": -43.257476806640625, + "objective/kl": 10.434656143188477, + "objective/non_score_reward": -1.0434656143188477, + "objective/rlhf_reward": -2.512002890527831, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 14.093836784362793, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6398032903671265, + "step": 1929, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998741149902344 + }, + { + "episode": 30896, + "epoch": 0.5553438544774778, + "loss/policy_avg": -0.3979191780090332, + "lr": 2.629984662576687e-06, + "objective/entropy": 85.81379699707031, + "objective/kl": 13.083731651306152, + "objective/non_score_reward": -1.308373212814331, + "objective/rlhf_reward": -3.408664400848459, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 16.67183494567871, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5375758409500122, + "step": 1930, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0003044605255127 + }, + { + "episode": 30912, + "epoch": 0.5556314483948664, + "loss/policy_avg": 0.4099561870098114, + "lr": 2.6297929447852762e-06, + "objective/entropy": -108.34793853759766, + "objective/kl": 19.073108673095703, + "objective/non_score_reward": -1.9073107242584229, + "objective/rlhf_reward": -6.025123406116085, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 47.74517822265625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6598101854324341, + "step": 1931, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0001935958862305 + }, + { + "episode": 30928, + "epoch": 0.5559190423122551, + "loss/policy_avg": 0.8242303729057312, + "lr": 2.629601226993865e-06, + "objective/entropy": 56.59760284423828, + "objective/kl": 17.36560821533203, + "objective/non_score_reward": -1.7365608215332031, + "objective/rlhf_reward": -2.5462436139583584, + "objective/scores": 1.1, + "policy/approxkl_avg": 25.9139404296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4316277503967285, + "step": 1932, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991772174835205 + }, + { + "episode": 30944, + "epoch": 0.5562066362296437, + "loss/policy_avg": 1.5252747535705566, + "lr": 2.629409509202454e-06, + "objective/entropy": 9.067985534667969, + "objective/kl": 16.401121139526367, + "objective/non_score_reward": -1.6401121616363525, + "objective/rlhf_reward": -2.1604487061500546, + "objective/scores": 1.1, + "policy/approxkl_avg": 18.144981384277344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6066452264785767, + "step": 1933, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995198249816895 + }, + { + "episode": 30960, + "epoch": 0.5564942301470324, + "loss/policy_avg": 0.16620782017707825, + "lr": 2.629217791411043e-06, + "objective/entropy": 20.33401107788086, + "objective/kl": 19.010009765625, + "objective/non_score_reward": -1.9010009765625, + "objective/rlhf_reward": -5.779175217422555, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 24.804540634155273, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5517880916595459, + "step": 1934, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990503787994385 + }, + { + "episode": 30976, + "epoch": 0.556781824064421, + "loss/policy_avg": 0.1389436423778534, + "lr": 2.629026073619632e-06, + "objective/entropy": -255.1221923828125, + "objective/kl": 11.334492683410645, + "objective/non_score_reward": -1.1334493160247803, + "objective/rlhf_reward": -4.133797264099121, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.180503845214844, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6840099096298218, + "step": 1935, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9998408555984497 + }, + { + "episode": 30992, + "epoch": 0.5570694179818096, + "loss/policy_avg": 0.14210224151611328, + "lr": 2.628834355828221e-06, + "objective/entropy": 51.18537139892578, + "objective/kl": 15.396123886108398, + "objective/non_score_reward": -1.5396125316619873, + "objective/rlhf_reward": -1.758449977636337, + "objective/scores": 1.1, + "policy/approxkl_avg": 49.77583312988281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4005848169326782, + "step": 1936, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984276294708252 + }, + { + "episode": 31008, + "epoch": 0.5573570118991983, + "loss/policy_avg": 0.22125059366226196, + "lr": 2.62864263803681e-06, + "objective/entropy": 101.8367691040039, + "objective/kl": 12.313599586486816, + "objective/non_score_reward": -1.2313599586486816, + "objective/rlhf_reward": -6.925439834594727, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.22418212890625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.33145105838775635, + "step": 1937, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973564147949219 + }, + { + "episode": 31024, + "epoch": 0.5576446058165869, + "loss/policy_avg": 0.060771048069000244, + "lr": 2.6284509202453987e-06, + "objective/entropy": 197.3813934326172, + "objective/kl": 14.080726623535156, + "objective/non_score_reward": -1.4080727100372314, + "objective/rlhf_reward": -7.632290840148926, + "objective/scores": -0.5, + "policy/approxkl_avg": 18.806310653686523, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5231298804283142, + "step": 1938, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000919818878174 + }, + { + "episode": 31040, + "epoch": 0.5579321997339757, + "loss/policy_avg": 0.28798460960388184, + "lr": 2.628259202453988e-06, + "objective/entropy": -39.44812774658203, + "objective/kl": 10.068058967590332, + "objective/non_score_reward": -1.0068060159683228, + "objective/rlhf_reward": 0.37277611494064367, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.18744659423828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5737462639808655, + "step": 1939, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989917278289795 + }, + { + "episode": 31056, + "epoch": 0.5582197936513643, + "loss/policy_avg": 0.2492627054452896, + "lr": 2.6280674846625768e-06, + "objective/entropy": 301.3166198730469, + "objective/kl": 15.651753425598145, + "objective/non_score_reward": -1.5651755332946777, + "objective/rlhf_reward": -3.3369828208696575, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 38.802520751953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7086349129676819, + "step": 1940, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9971044063568115 + }, + { + "episode": 31072, + "epoch": 0.558507387568753, + "loss/policy_avg": 0.5365092754364014, + "lr": 2.627875766871166e-06, + "objective/entropy": -48.53523254394531, + "objective/kl": 16.297000885009766, + "objective/non_score_reward": -1.6297001838684082, + "objective/rlhf_reward": -2.118800616264343, + "objective/scores": 1.1, + "policy/approxkl_avg": 51.571128845214844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5370905995368958, + "step": 1941, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003647804260254 + }, + { + "episode": 31088, + "epoch": 0.5587949814861416, + "loss/policy_avg": 0.07891640812158585, + "lr": 2.627684049079755e-06, + "objective/entropy": -131.36619567871094, + "objective/kl": 15.51970100402832, + "objective/non_score_reward": -1.5519700050354004, + "objective/rlhf_reward": -4.085174175278221, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 234.3270721435547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5256676077842712, + "step": 1942, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981193542480469 + }, + { + "episode": 31104, + "epoch": 0.5590825754035302, + "loss/policy_avg": 0.17335310578346252, + "lr": 2.6274923312883436e-06, + "objective/entropy": 43.700927734375, + "objective/kl": 18.097789764404297, + "objective/non_score_reward": -1.809779167175293, + "objective/rlhf_reward": -2.839116489887237, + "objective/scores": 1.1, + "policy/approxkl_avg": 15.332867622375488, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5896799564361572, + "step": 1943, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998621940612793 + }, + { + "episode": 31120, + "epoch": 0.5593701693209189, + "loss/policy_avg": 0.6577703952789307, + "lr": 2.627300613496933e-06, + "objective/entropy": -123.42121124267578, + "objective/kl": 13.580167770385742, + "objective/non_score_reward": -1.3580169677734375, + "objective/rlhf_reward": -2.508348677993986, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 18.91336441040039, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6367899775505066, + "step": 1944, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9985871315002441 + }, + { + "episode": 31136, + "epoch": 0.5596577632383075, + "loss/policy_avg": 0.207408145070076, + "lr": 2.6271088957055213e-06, + "objective/entropy": 51.933204650878906, + "objective/kl": 21.24639129638672, + "objective/non_score_reward": -2.1246392726898193, + "objective/rlhf_reward": -6.942297666278437, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 20.490079879760742, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7846795320510864, + "step": 1945, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000905990600586 + }, + { + "episode": 31152, + "epoch": 0.5599453571556962, + "loss/policy_avg": 0.3303542733192444, + "lr": 2.6269171779141105e-06, + "objective/entropy": 53.62565994262695, + "objective/kl": 11.193902015686035, + "objective/non_score_reward": -1.1193903684616089, + "objective/rlhf_reward": -2.744227902094523, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 3.3035974502563477, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4402533769607544, + "step": 1946, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998788833618164 + }, + { + "episode": 31168, + "epoch": 0.5602329510730848, + "loss/policy_avg": 0.5981451272964478, + "lr": 2.6267254601226993e-06, + "objective/entropy": 21.825145721435547, + "objective/kl": 14.38486385345459, + "objective/non_score_reward": -1.4384863376617432, + "objective/rlhf_reward": -1.3539454698562619, + "objective/scores": 1.1, + "policy/approxkl_avg": 37.79143524169922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5124356746673584, + "step": 1947, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9986077547073364 + }, + { + "episode": 31184, + "epoch": 0.5605205449904734, + "loss/policy_avg": -0.11670660972595215, + "lr": 2.626533742331288e-06, + "objective/entropy": 86.26981353759766, + "objective/kl": 18.290149688720703, + "objective/non_score_reward": -1.8290153741836548, + "objective/rlhf_reward": -6.916061496734619, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.349811553955078, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6505062580108643, + "step": 1948, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995582103729248 + }, + { + "episode": 31200, + "epoch": 0.5608081389078621, + "loss/policy_avg": 0.1440569907426834, + "lr": 2.6263420245398773e-06, + "objective/entropy": -76.61561584472656, + "objective/kl": 10.80213737487793, + "objective/non_score_reward": -1.0802137851715088, + "objective/rlhf_reward": -2.496026153835367, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 13.197822570800781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6293392181396484, + "step": 1949, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0017919540405273 + }, + { + "episode": 31216, + "epoch": 0.5610957328252507, + "loss/policy_avg": -0.1457497775554657, + "lr": 2.626150306748466e-06, + "objective/entropy": -66.14817810058594, + "objective/kl": 11.837576866149902, + "objective/non_score_reward": -1.1837577819824219, + "objective/rlhf_reward": -3.001697854200999, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.371072292327881, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.601198136806488, + "step": 1950, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993054866790771 + }, + { + "episode": 31232, + "epoch": 0.5613833267426394, + "loss/policy_avg": 0.08133930712938309, + "lr": 2.6259585889570554e-06, + "objective/entropy": 35.34192657470703, + "objective/kl": 16.1737003326416, + "objective/non_score_reward": -1.6173698902130127, + "objective/rlhf_reward": -3.5457607849848003, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 51.5657958984375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4502888321876526, + "step": 1951, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99739408493042 + }, + { + "episode": 31248, + "epoch": 0.561670920660028, + "loss/policy_avg": 0.5748536586761475, + "lr": 2.625766871165644e-06, + "objective/entropy": 277.04815673828125, + "objective/kl": 15.610389709472656, + "objective/non_score_reward": -1.5610390901565552, + "objective/rlhf_reward": -8.244155883789062, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.583017349243164, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7663341760635376, + "step": 1952, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974110126495361 + }, + { + "episode": 31264, + "epoch": 0.5619585145774166, + "loss/policy_avg": 0.40739142894744873, + "lr": 2.625575153374233e-06, + "objective/entropy": 44.04022979736328, + "objective/kl": 13.912942886352539, + "objective/non_score_reward": -1.391294240951538, + "objective/rlhf_reward": -2.6414577111017437, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 132.23989868164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8223682641983032, + "step": 1953, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980732202529907 + }, + { + "episode": 31280, + "epoch": 0.5622461084948054, + "loss/policy_avg": 0.2794537842273712, + "lr": 2.6253834355828222e-06, + "objective/entropy": 52.798545837402344, + "objective/kl": 15.072214126586914, + "objective/non_score_reward": -1.5072214603424072, + "objective/rlhf_reward": -8.028885841369629, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.49165344238281, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7069833278656006, + "step": 1954, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987144470214844 + }, + { + "episode": 31296, + "epoch": 0.562533702412194, + "loss/policy_avg": 0.1479717493057251, + "lr": 2.625191717791411e-06, + "objective/entropy": 59.11641311645508, + "objective/kl": 20.31096649169922, + "objective/non_score_reward": -2.031096935272217, + "objective/rlhf_reward": -6.176975856499608, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 26.35702133178711, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.502386212348938, + "step": 1955, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998948097229004 + }, + { + "episode": 31312, + "epoch": 0.5628212963295827, + "loss/policy_avg": 0.1483973264694214, + "lr": 2.6250000000000003e-06, + "objective/entropy": 172.51075744628906, + "objective/kl": 14.092042922973633, + "objective/non_score_reward": -1.409204363822937, + "objective/rlhf_reward": -3.6894061667489364, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 43.896331787109375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5590693950653076, + "step": 1956, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002806186676025 + }, + { + "episode": 31328, + "epoch": 0.5631088902469713, + "loss/policy_avg": 0.7653570175170898, + "lr": 2.624808282208589e-06, + "objective/entropy": 128.2239532470703, + "objective/kl": 11.804733276367188, + "objective/non_score_reward": -1.1804733276367188, + "objective/rlhf_reward": -6.721893310546875, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.992034912109375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6958256363868713, + "step": 1957, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.003971576690674 + }, + { + "episode": 31344, + "epoch": 0.56339648416436, + "loss/policy_avg": -0.05574982613325119, + "lr": 2.624616564417178e-06, + "objective/entropy": -8.689697265625, + "objective/kl": 16.48877716064453, + "objective/non_score_reward": -1.6488778591156006, + "objective/rlhf_reward": -3.671792362571928, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 20.264589309692383, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.45056283473968506, + "step": 1958, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9995653629302979 + }, + { + "episode": 31360, + "epoch": 0.5636840780817486, + "loss/policy_avg": 0.15298953652381897, + "lr": 2.624424846625767e-06, + "objective/entropy": 50.52933120727539, + "objective/kl": 22.40363311767578, + "objective/non_score_reward": -2.240363597869873, + "objective/rlhf_reward": -10.961454391479492, + "objective/scores": -0.5, + "policy/approxkl_avg": 36.90216827392578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5374445915222168, + "step": 1959, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000500679016113 + }, + { + "episode": 31376, + "epoch": 0.5639716719991372, + "loss/policy_avg": 1.3337948322296143, + "lr": 2.624233128834356e-06, + "objective/entropy": 76.81420135498047, + "objective/kl": 13.708114624023438, + "objective/non_score_reward": -1.3708115816116333, + "objective/rlhf_reward": -5.083246028423309, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.27752685546875, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7236926555633545, + "step": 1960, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998272180557251 + }, + { + "episode": 31392, + "epoch": 0.5642592659165259, + "loss/policy_avg": 0.016748720780014992, + "lr": 2.6240414110429447e-06, + "objective/entropy": 227.30496215820312, + "objective/kl": 16.957714080810547, + "objective/non_score_reward": -1.6957714557647705, + "objective/rlhf_reward": -6.383085823059082, + "objective/scores": 0.1, + "policy/approxkl_avg": 53.046302795410156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5604294538497925, + "step": 1961, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976122379302979 + }, + { + "episode": 31408, + "epoch": 0.5645468598339145, + "loss/policy_avg": 0.45783132314682007, + "lr": 2.623849693251534e-06, + "objective/entropy": -48.12693786621094, + "objective/kl": 15.814268112182617, + "objective/non_score_reward": -1.5814268589019775, + "objective/rlhf_reward": -5.925707584619522, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.74335479736328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5133075714111328, + "step": 1962, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977269172668457 + }, + { + "episode": 31424, + "epoch": 0.5648344537513031, + "loss/policy_avg": -0.22306829690933228, + "lr": 2.623657975460123e-06, + "objective/entropy": 128.39199829101562, + "objective/kl": 10.377822875976562, + "objective/non_score_reward": -1.0377821922302246, + "objective/rlhf_reward": -3.751129066944122, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.1646804809570312, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.598357081413269, + "step": 1963, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0028271675109863 + }, + { + "episode": 31440, + "epoch": 0.5651220476686918, + "loss/policy_avg": 0.2750639021396637, + "lr": 2.623466257668712e-06, + "objective/entropy": 8.598228454589844, + "objective/kl": 8.15994644165039, + "objective/non_score_reward": -0.8159946203231812, + "objective/rlhf_reward": -5.263978481292725, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.306640625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.572208046913147, + "step": 1964, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998093843460083 + }, + { + "episode": 31456, + "epoch": 0.5654096415860804, + "loss/policy_avg": 0.4767600893974304, + "lr": 2.623274539877301e-06, + "objective/entropy": 110.88385009765625, + "objective/kl": 14.536544799804688, + "objective/non_score_reward": -1.453654408454895, + "objective/rlhf_reward": -3.4146176934242245, + "objective/scores": 0.6, + "policy/approxkl_avg": 44.370018005371094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7621891498565674, + "step": 1965, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984304904937744 + }, + { + "episode": 31472, + "epoch": 0.5656972355034691, + "loss/policy_avg": 0.6700938940048218, + "lr": 2.6230828220858896e-06, + "objective/entropy": 91.74417114257812, + "objective/kl": 16.03462791442871, + "objective/non_score_reward": -1.6034626960754395, + "objective/rlhf_reward": -3.4901316508066387, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 47.99883270263672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.536391019821167, + "step": 1966, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991071224212646 + }, + { + "episode": 31488, + "epoch": 0.5659848294208577, + "loss/policy_avg": 0.2143508791923523, + "lr": 2.622891104294479e-06, + "objective/entropy": 50.70976257324219, + "objective/kl": 13.917573928833008, + "objective/non_score_reward": -1.3917572498321533, + "objective/rlhf_reward": -1.1670292973518368, + "objective/scores": 1.1, + "policy/approxkl_avg": 34.19868850708008, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7093191742897034, + "step": 1967, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0008420944213867 + }, + { + "episode": 31504, + "epoch": 0.5662724233382463, + "loss/policy_avg": 0.27268171310424805, + "lr": 2.6226993865030673e-06, + "objective/entropy": 74.58323669433594, + "objective/kl": 13.26856803894043, + "objective/non_score_reward": -1.3268567323684692, + "objective/rlhf_reward": -7.307426929473877, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.578540802001953, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4392435550689697, + "step": 1968, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976592063903809 + }, + { + "episode": 31520, + "epoch": 0.5665600172556351, + "loss/policy_avg": 0.26809120178222656, + "lr": 2.6225076687116565e-06, + "objective/entropy": -37.09600067138672, + "objective/kl": 13.520483016967773, + "objective/non_score_reward": -1.352048397064209, + "objective/rlhf_reward": -3.008193230628967, + "objective/scores": 0.6, + "policy/approxkl_avg": 23.37268829345703, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.9211034774780273, + "step": 1969, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987969398498535 + }, + { + "episode": 31536, + "epoch": 0.5668476111730237, + "loss/policy_avg": 0.0321061909198761, + "lr": 2.6223159509202453e-06, + "objective/entropy": -193.0868682861328, + "objective/kl": 10.896371841430664, + "objective/non_score_reward": -1.089637041091919, + "objective/rlhf_reward": 0.0414515376091007, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.7421770095825195, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6736397743225098, + "step": 1970, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986076354980469 + }, + { + "episode": 31552, + "epoch": 0.5671352050904124, + "loss/policy_avg": 0.9158979654312134, + "lr": 2.622124233128834e-06, + "objective/entropy": 98.08502197265625, + "objective/kl": 11.713408470153809, + "objective/non_score_reward": -1.1713407039642334, + "objective/rlhf_reward": -4.285363173484802, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.338741302490234, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6211938858032227, + "step": 1971, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001859188079834 + }, + { + "episode": 31568, + "epoch": 0.567422799007801, + "loss/policy_avg": 0.15321816504001617, + "lr": 2.6219325153374233e-06, + "objective/entropy": 57.19317626953125, + "objective/kl": 12.833736419677734, + "objective/non_score_reward": -1.2833735942840576, + "objective/rlhf_reward": -4.73349437713623, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.374547958374023, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8906912803649902, + "step": 1972, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997797012329102 + }, + { + "episode": 31584, + "epoch": 0.5677103929251897, + "loss/policy_avg": 0.04171551764011383, + "lr": 2.621740797546012e-06, + "objective/entropy": -98.7196044921875, + "objective/kl": 12.432456016540527, + "objective/non_score_reward": -1.2432456016540527, + "objective/rlhf_reward": -4.572982108592987, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.483003616333008, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6093274354934692, + "step": 1973, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983088970184326 + }, + { + "episode": 31600, + "epoch": 0.5679979868425783, + "loss/policy_avg": 0.2402482032775879, + "lr": 2.6215490797546014e-06, + "objective/entropy": 67.22429656982422, + "objective/kl": 19.42490005493164, + "objective/non_score_reward": -1.9424902200698853, + "objective/rlhf_reward": -6.21370169421728, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 28.987808227539062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6655051708221436, + "step": 1974, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978091716766357 + }, + { + "episode": 31616, + "epoch": 0.5682855807599669, + "loss/policy_avg": -0.026089750230312347, + "lr": 2.62135736196319e-06, + "objective/entropy": 158.90890502929688, + "objective/kl": 20.56407928466797, + "objective/non_score_reward": -2.056407928466797, + "objective/rlhf_reward": -10.225631713867188, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.276256561279297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6126078367233276, + "step": 1975, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00205397605896 + }, + { + "episode": 31632, + "epoch": 0.5685731746773556, + "loss/policy_avg": 0.38028374314308167, + "lr": 2.621165644171779e-06, + "objective/entropy": 239.01248168945312, + "objective/kl": 13.714025497436523, + "objective/non_score_reward": -1.371402621269226, + "objective/rlhf_reward": -5.0856106042861935, + "objective/scores": 0.1, + "policy/approxkl_avg": 54.81404495239258, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5809019804000854, + "step": 1976, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9958641529083252 + }, + { + "episode": 31648, + "epoch": 0.5688607685947442, + "loss/policy_avg": -0.05175406485795975, + "lr": 2.6209739263803682e-06, + "objective/entropy": 124.25830078125, + "objective/kl": 10.488260269165039, + "objective/non_score_reward": -1.048825979232788, + "objective/rlhf_reward": -1.2715851708662238, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.1010313034057617, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8621615171432495, + "step": 1977, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0017189979553223 + }, + { + "episode": 31664, + "epoch": 0.5691483625121329, + "loss/policy_avg": 0.26172560453414917, + "lr": 2.620782208588957e-06, + "objective/entropy": -3.961435317993164, + "objective/kl": 15.588273048400879, + "objective/non_score_reward": -1.5588274002075195, + "objective/rlhf_reward": -5.835309481620788, + "objective/scores": 0.1, + "policy/approxkl_avg": 8.479052543640137, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5890230536460876, + "step": 1978, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999295949935913 + }, + { + "episode": 31680, + "epoch": 0.5694359564295215, + "loss/policy_avg": 0.07885098457336426, + "lr": 2.6205904907975463e-06, + "objective/entropy": 163.91082763671875, + "objective/kl": 15.549473762512207, + "objective/non_score_reward": -1.5549473762512207, + "objective/rlhf_reward": -3.8197897434234616, + "objective/scores": 0.6, + "policy/approxkl_avg": 12.931072235107422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7356927394866943, + "step": 1979, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9969947338104248 + }, + { + "episode": 31696, + "epoch": 0.5697235503469101, + "loss/policy_avg": 0.3573415279388428, + "lr": 2.620398773006135e-06, + "objective/entropy": -116.21089935302734, + "objective/kl": 13.6551513671875, + "objective/non_score_reward": -1.3655149936676025, + "objective/rlhf_reward": -5.062060168385505, + "objective/scores": 0.1, + "policy/approxkl_avg": 33.5490608215332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5857123136520386, + "step": 1980, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999571442604065 + }, + { + "episode": 31712, + "epoch": 0.5700111442642988, + "loss/policy_avg": -0.036694854497909546, + "lr": 2.620207055214724e-06, + "objective/entropy": 126.82040405273438, + "objective/kl": 8.93669319152832, + "objective/non_score_reward": -0.8936692476272583, + "objective/rlhf_reward": -5.574676990509033, + "objective/scores": -0.5, + "policy/approxkl_avg": 72.33108520507812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6285960078239441, + "step": 1981, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009305477142334 + }, + { + "episode": 31728, + "epoch": 0.5702987381816874, + "loss/policy_avg": 0.2230103760957718, + "lr": 2.620015337423313e-06, + "objective/entropy": 83.2691879272461, + "objective/kl": 13.386994361877441, + "objective/non_score_reward": -1.338699460029602, + "objective/rlhf_reward": -3.5299691214886417, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 76.8690185546875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.49291151762008667, + "step": 1982, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000518798828125 + }, + { + "episode": 31744, + "epoch": 0.570586332099076, + "loss/policy_avg": 0.6935573220252991, + "lr": 2.619823619631902e-06, + "objective/entropy": 334.6748046875, + "objective/kl": 23.331436157226562, + "objective/non_score_reward": -2.333143711090088, + "objective/rlhf_reward": -8.932574546337127, + "objective/scores": 0.1, + "policy/approxkl_avg": 95.91483306884766, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.8640803694725037, + "step": 1983, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9957650899887085 + }, + { + "episode": 31760, + "epoch": 0.5708739260164648, + "loss/policy_avg": -0.06788041442632675, + "lr": 2.6196319018404908e-06, + "objective/entropy": -40.34531784057617, + "objective/kl": 10.183060646057129, + "objective/non_score_reward": -1.018306016921997, + "objective/rlhf_reward": -2.4113647095566852, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 17.93832015991211, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.506335973739624, + "step": 1984, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000466823577881 + }, + { + "episode": 31776, + "epoch": 0.5711615199338534, + "loss/policy_avg": 0.27647337317466736, + "lr": 2.61944018404908e-06, + "objective/entropy": 219.0719451904297, + "objective/kl": 15.476117134094238, + "objective/non_score_reward": -1.5476117134094238, + "objective/rlhf_reward": -5.790447032451629, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.48750686645508, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5134598016738892, + "step": 1985, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003304958343506 + }, + { + "episode": 31792, + "epoch": 0.5714491138512421, + "loss/policy_avg": 0.36243271827697754, + "lr": 2.619248466257669e-06, + "objective/entropy": -5.402244567871094, + "objective/kl": 13.373044967651367, + "objective/non_score_reward": -1.3373044729232788, + "objective/rlhf_reward": -7.349217891693115, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.055587768554688, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5952838659286499, + "step": 1986, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0000016689300537 + }, + { + "episode": 31808, + "epoch": 0.5717367077686307, + "loss/policy_avg": 0.030699964612722397, + "lr": 2.619056748466258e-06, + "objective/entropy": 235.13650512695312, + "objective/kl": 14.822487831115723, + "objective/non_score_reward": -1.4822489023208618, + "objective/rlhf_reward": -7.928995609283447, + "objective/scores": -0.5, + "policy/approxkl_avg": 64.26378631591797, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7725933790206909, + "step": 1987, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992258548736572 + }, + { + "episode": 31824, + "epoch": 0.5720243016860194, + "loss/policy_avg": 0.03670354560017586, + "lr": 2.618865030674847e-06, + "objective/entropy": -120.67810821533203, + "objective/kl": 13.513415336608887, + "objective/non_score_reward": -1.3513414859771729, + "objective/rlhf_reward": -7.405365943908691, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.002901077270508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6766031980514526, + "step": 1988, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997832775115967 + }, + { + "episode": 31840, + "epoch": 0.572311895603408, + "loss/policy_avg": 0.2261669933795929, + "lr": 2.6186733128834357e-06, + "objective/entropy": 183.0972442626953, + "objective/kl": 21.516197204589844, + "objective/non_score_reward": -2.1516199111938477, + "objective/rlhf_reward": -10.60647964477539, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.96692657470703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43175703287124634, + "step": 1989, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977149963378906 + }, + { + "episode": 31856, + "epoch": 0.5725994895207966, + "loss/policy_avg": 0.533334493637085, + "lr": 2.6184815950920245e-06, + "objective/entropy": 176.357421875, + "objective/kl": 17.872817993164062, + "objective/non_score_reward": -1.7872819900512695, + "objective/rlhf_reward": -9.149127960205078, + "objective/scores": -0.5, + "policy/approxkl_avg": 185.1457061767578, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5334278345108032, + "step": 1990, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992812871932983 + }, + { + "episode": 31872, + "epoch": 0.5728870834381853, + "loss/policy_avg": 0.15894320607185364, + "lr": 2.6182898773006133e-06, + "objective/entropy": -197.00643920898438, + "objective/kl": 10.364501953125, + "objective/non_score_reward": -1.0364501476287842, + "objective/rlhf_reward": -6.145800590515137, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.997642517089844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6820682883262634, + "step": 1991, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9986462593078613 + }, + { + "episode": 31888, + "epoch": 0.5731746773555739, + "loss/policy_avg": 0.2844724953174591, + "lr": 2.6180981595092025e-06, + "objective/entropy": -84.8385238647461, + "objective/kl": 14.503667831420898, + "objective/non_score_reward": -1.450366735458374, + "objective/rlhf_reward": -3.678760858551536, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 28.349754333496094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6058744788169861, + "step": 1992, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997042179107666 + }, + { + "episode": 31904, + "epoch": 0.5734622712729626, + "loss/policy_avg": 0.5715179443359375, + "lr": 2.6179064417177913e-06, + "objective/entropy": -42.54454803466797, + "objective/kl": 17.865976333618164, + "objective/non_score_reward": -1.786597728729248, + "objective/rlhf_reward": -6.746390855312347, + "objective/scores": 0.1, + "policy/approxkl_avg": 28.264297485351562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4473644196987152, + "step": 1993, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976756572723389 + }, + { + "episode": 31920, + "epoch": 0.5737498651903512, + "loss/policy_avg": 0.45799100399017334, + "lr": 2.6177147239263805e-06, + "objective/entropy": -278.7918701171875, + "objective/kl": 8.003768920898438, + "objective/non_score_reward": -0.8003770112991333, + "objective/rlhf_reward": -1.5396485380536182, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 38.73306655883789, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.650258481502533, + "step": 1994, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.999009132385254 + }, + { + "episode": 31936, + "epoch": 0.5740374591077398, + "loss/policy_avg": -0.06823521852493286, + "lr": 2.6175230061349694e-06, + "objective/entropy": -70.7914810180664, + "objective/kl": 12.955906867980957, + "objective/non_score_reward": -1.2955907583236694, + "objective/rlhf_reward": -3.6261036982208044, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 24.642898559570312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4823078513145447, + "step": 1995, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002995014190674 + }, + { + "episode": 31952, + "epoch": 0.5743250530251285, + "loss/policy_avg": -0.12416256964206696, + "lr": 2.617331288343558e-06, + "objective/entropy": -112.54239654541016, + "objective/kl": 3.643691062927246, + "objective/non_score_reward": -0.36436912417411804, + "objective/rlhf_reward": 0.9425235554575919, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.6883997917175293, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4229438304901123, + "step": 1996, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.001127004623413 + }, + { + "episode": 31968, + "epoch": 0.5746126469425171, + "loss/policy_avg": 0.8278412222862244, + "lr": 2.6171395705521474e-06, + "objective/entropy": -195.83651733398438, + "objective/kl": 14.30837631225586, + "objective/non_score_reward": -1.4308377504348755, + "objective/rlhf_reward": -3.898522193702768, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 101.39013671875, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5440574884414673, + "step": 1997, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9996654987335205 + }, + { + "episode": 31984, + "epoch": 0.5749002408599058, + "loss/policy_avg": 0.5108898282051086, + "lr": 2.616947852760736e-06, + "objective/entropy": 372.1040344238281, + "objective/kl": 18.92198944091797, + "objective/non_score_reward": -1.8921990394592285, + "objective/rlhf_reward": -7.1687962174415585, + "objective/scores": 0.1, + "policy/approxkl_avg": 18.65614128112793, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9018811583518982, + "step": 1998, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997493028640747 + }, + { + "episode": 32000, + "epoch": 0.5751878347772945, + "loss/policy_avg": 2.341435670852661, + "lr": 2.616756134969325e-06, + "objective/entropy": 200.54232788085938, + "objective/kl": 6.54392147064209, + "objective/non_score_reward": -0.6543921232223511, + "objective/rlhf_reward": -2.217568612098694, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.1673641204833984, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5942652225494385, + "step": 1999, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0033254623413086 + }, + { + "episode": 32016, + "epoch": 0.5754754286946832, + "loss/policy_avg": 0.32693976163864136, + "lr": 2.6165644171779142e-06, + "objective/entropy": 50.55780029296875, + "objective/kl": 11.196540832519531, + "objective/non_score_reward": -1.1196540594100952, + "objective/rlhf_reward": -1.554897238255712, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 25.62256622314453, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8175270557403564, + "step": 2000, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.996117115020752 + }, + { + "episode": 32032, + "epoch": 0.5757630226120718, + "loss/policy_avg": 0.523784875869751, + "lr": 2.616372699386503e-06, + "objective/entropy": 142.1219024658203, + "objective/kl": 19.365474700927734, + "objective/non_score_reward": -1.9365475177764893, + "objective/rlhf_reward": -6.012856737772623, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 87.06364440917969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5942108035087585, + "step": 2001, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976255893707275 + }, + { + "episode": 32048, + "epoch": 0.5760506165294604, + "loss/policy_avg": 0.37893959879875183, + "lr": 2.6161809815950923e-06, + "objective/entropy": -63.05537414550781, + "objective/kl": 14.981361389160156, + "objective/non_score_reward": -1.498136281967163, + "objective/rlhf_reward": -3.5925449490547177, + "objective/scores": 0.6, + "policy/approxkl_avg": 7.1709418296813965, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6707227230072021, + "step": 2002, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987297058105469 + }, + { + "episode": 32064, + "epoch": 0.5763382104468491, + "loss/policy_avg": 0.6728400588035583, + "lr": 2.615989263803681e-06, + "objective/entropy": -191.27810668945312, + "objective/kl": 12.201635360717773, + "objective/non_score_reward": -1.2201635837554932, + "objective/rlhf_reward": -2.4806545138359066, + "objective/scores": 0.6, + "policy/approxkl_avg": 37.39985656738281, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5896199941635132, + "step": 2003, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985238313674927 + }, + { + "episode": 32080, + "epoch": 0.5766258043642377, + "loss/policy_avg": -0.35722067952156067, + "lr": 2.61579754601227e-06, + "objective/entropy": 25.450927734375, + "objective/kl": 13.033873558044434, + "objective/non_score_reward": -1.3033874034881592, + "objective/rlhf_reward": -2.8135497927665707, + "objective/scores": 0.6, + "policy/approxkl_avg": 39.72835159301758, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.44449368119239807, + "step": 2004, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0003347396850586 + }, + { + "episode": 32096, + "epoch": 0.5769133982816264, + "loss/policy_avg": 0.43654775619506836, + "lr": 2.615605828220859e-06, + "objective/entropy": -39.03499984741211, + "objective/kl": 15.21171760559082, + "objective/non_score_reward": -1.521172046661377, + "objective/rlhf_reward": -5.684687769412994, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.998878479003906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7011727094650269, + "step": 2005, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9980759620666504 + }, + { + "episode": 32112, + "epoch": 0.577200992199015, + "loss/policy_avg": -0.0905875414609909, + "lr": 2.615414110429448e-06, + "objective/entropy": 184.36927795410156, + "objective/kl": 18.664302825927734, + "objective/non_score_reward": -1.866430401802063, + "objective/rlhf_reward": -5.343015732542549, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 12.670072555541992, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8122575283050537, + "step": 2006, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0021438598632812 + }, + { + "episode": 32128, + "epoch": 0.5774885861164036, + "loss/policy_avg": 0.4686034619808197, + "lr": 2.615222392638037e-06, + "objective/entropy": 6.0128631591796875, + "objective/kl": 15.820640563964844, + "objective/non_score_reward": -1.582064151763916, + "objective/rlhf_reward": -8.328256607055664, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.504484176635742, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6733450889587402, + "step": 2007, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997711420059204 + }, + { + "episode": 32144, + "epoch": 0.5777761800337923, + "loss/policy_avg": 0.046472251415252686, + "lr": 2.615030674846626e-06, + "objective/entropy": -84.07270050048828, + "objective/kl": 11.857654571533203, + "objective/non_score_reward": -1.1857655048370361, + "objective/rlhf_reward": -6.7430620193481445, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.025468826293945, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9546033143997192, + "step": 2008, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0003514289855957 + }, + { + "episode": 32160, + "epoch": 0.5780637739511809, + "loss/policy_avg": 0.0446510948240757, + "lr": 2.614838957055215e-06, + "objective/entropy": 38.73136901855469, + "objective/kl": 17.77886962890625, + "objective/non_score_reward": -1.7778868675231934, + "objective/rlhf_reward": -5.286718364032815, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 15.033638954162598, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5836319923400879, + "step": 2009, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000471591949463 + }, + { + "episode": 32176, + "epoch": 0.5783513678685696, + "loss/policy_avg": -0.23901139199733734, + "lr": 2.614647239263804e-06, + "objective/entropy": -46.67261505126953, + "objective/kl": 12.934122085571289, + "objective/non_score_reward": -1.293412208557129, + "objective/rlhf_reward": -3.0509426019349437, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 77.60780334472656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5740710496902466, + "step": 2010, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0050253868103027 + }, + { + "episode": 32192, + "epoch": 0.5786389617859582, + "loss/policy_avg": -0.44756045937538147, + "lr": 2.614455521472393e-06, + "objective/entropy": -230.7035369873047, + "objective/kl": 10.906891822814941, + "objective/non_score_reward": -1.0906891822814941, + "objective/rlhf_reward": -3.962757086753845, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.9308230876922607, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5085049867630005, + "step": 2011, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.002194404602051 + }, + { + "episode": 32208, + "epoch": 0.5789265557033468, + "loss/policy_avg": 0.5888445973396301, + "lr": 2.6142638036809817e-06, + "objective/entropy": 220.92822265625, + "objective/kl": 17.640647888183594, + "objective/non_score_reward": -1.7640647888183594, + "objective/rlhf_reward": -6.656258976459503, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.095348358154297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.809324324131012, + "step": 2012, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005431175231934 + }, + { + "episode": 32224, + "epoch": 0.5792141496207355, + "loss/policy_avg": 0.026016470044851303, + "lr": 2.6140720858895705e-06, + "objective/entropy": 299.5074462890625, + "objective/kl": 16.170055389404297, + "objective/non_score_reward": -1.617005705833435, + "objective/rlhf_reward": -6.06802282333374, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.101377487182617, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7076947689056396, + "step": 2013, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998183250427246 + }, + { + "episode": 32240, + "epoch": 0.5795017435381241, + "loss/policy_avg": 0.38306957483291626, + "lr": 2.6138803680981593e-06, + "objective/entropy": 248.753173828125, + "objective/kl": 9.927104949951172, + "objective/non_score_reward": -0.992710530757904, + "objective/rlhf_reward": -3.570842123031616, + "objective/scores": 0.1, + "policy/approxkl_avg": 52.19784927368164, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7442108392715454, + "step": 2014, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99979829788208 + }, + { + "episode": 32256, + "epoch": 0.5797893374555129, + "loss/policy_avg": 0.7347298860549927, + "lr": 2.6136886503067485e-06, + "objective/entropy": 264.7408447265625, + "objective/kl": 20.418472290039062, + "objective/non_score_reward": -2.041846990585327, + "objective/rlhf_reward": -10.167387962341309, + "objective/scores": -0.5, + "policy/approxkl_avg": 30.207717895507812, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8410416841506958, + "step": 2015, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9964735507965088 + }, + { + "episode": 32272, + "epoch": 0.5800769313729015, + "loss/policy_avg": 0.10777494311332703, + "lr": 2.6134969325153373e-06, + "objective/entropy": -33.255218505859375, + "objective/kl": 12.130520820617676, + "objective/non_score_reward": -1.2130520343780518, + "objective/rlhf_reward": -4.452207899093628, + "objective/scores": 0.1, + "policy/approxkl_avg": 58.70673370361328, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5788019895553589, + "step": 2016, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999755620956421 + }, + { + "episode": 32288, + "epoch": 0.5803645252902901, + "loss/policy_avg": 0.4941823482513428, + "lr": 2.6133052147239266e-06, + "objective/entropy": 166.5862579345703, + "objective/kl": 18.242103576660156, + "objective/non_score_reward": -1.824210286140442, + "objective/rlhf_reward": -2.896841204166412, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.646388053894043, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6881924271583557, + "step": 2017, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998159408569336 + }, + { + "episode": 32304, + "epoch": 0.5806521192076788, + "loss/policy_avg": -0.22474974393844604, + "lr": 2.6131134969325154e-06, + "objective/entropy": -193.62680053710938, + "objective/kl": 10.035699844360352, + "objective/non_score_reward": -1.0035700798034668, + "objective/rlhf_reward": -1.090561126114103, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.371432304382324, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5917261838912964, + "step": 2018, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0010905265808105 + }, + { + "episode": 32320, + "epoch": 0.5809397131250674, + "loss/policy_avg": 0.24477297067642212, + "lr": 2.612921779141104e-06, + "objective/entropy": -146.20303344726562, + "objective/kl": 13.633316040039062, + "objective/non_score_reward": -1.3633315563201904, + "objective/rlhf_reward": -7.453326225280762, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.155723571777344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.474772572517395, + "step": 2019, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979207515716553 + }, + { + "episode": 32336, + "epoch": 0.5812273070424561, + "loss/policy_avg": 0.1283385157585144, + "lr": 2.6127300613496934e-06, + "objective/entropy": -5.738563537597656, + "objective/kl": 20.078689575195312, + "objective/non_score_reward": -2.007869243621826, + "objective/rlhf_reward": -7.631476855278015, + "objective/scores": 0.1, + "policy/approxkl_avg": 46.95381164550781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6904228329658508, + "step": 2020, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993395805358887 + }, + { + "episode": 32352, + "epoch": 0.5815149009598447, + "loss/policy_avg": 0.26754477620124817, + "lr": 2.6125383435582822e-06, + "objective/entropy": 127.22401428222656, + "objective/kl": 13.649679183959961, + "objective/non_score_reward": -1.364967942237854, + "objective/rlhf_reward": -7.459871768951416, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.265398025512695, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.39020296931266785, + "step": 2021, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974396228790283 + }, + { + "episode": 32368, + "epoch": 0.5818024948772333, + "loss/policy_avg": 0.20494411885738373, + "lr": 2.612346625766871e-06, + "objective/entropy": 185.99484252929688, + "objective/kl": 11.199531555175781, + "objective/non_score_reward": -1.1199530363082886, + "objective/rlhf_reward": -4.0798122644424435, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.743867874145508, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4661474823951721, + "step": 2022, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0036611557006836 + }, + { + "episode": 32384, + "epoch": 0.582090088794622, + "loss/policy_avg": -0.4422033429145813, + "lr": 2.6121549079754603e-06, + "objective/entropy": 106.64537048339844, + "objective/kl": 9.224336624145508, + "objective/non_score_reward": -0.9224337339401245, + "objective/rlhf_reward": -0.7660157426607339, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 3.0834896564483643, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5475035309791565, + "step": 2023, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0093464851379395 + }, + { + "episode": 32400, + "epoch": 0.5823776827120106, + "loss/policy_avg": 0.2676440179347992, + "lr": 2.611963190184049e-06, + "objective/entropy": 111.9757080078125, + "objective/kl": 17.51507568359375, + "objective/non_score_reward": -1.7515077590942383, + "objective/rlhf_reward": -9.006031036376953, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.372546195983887, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5439157485961914, + "step": 2024, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999245405197144 + }, + { + "episode": 32416, + "epoch": 0.5826652766293993, + "loss/policy_avg": 0.18945106863975525, + "lr": 2.6117714723926383e-06, + "objective/entropy": 183.9626007080078, + "objective/kl": 12.516935348510742, + "objective/non_score_reward": -1.2516934871673584, + "objective/rlhf_reward": -4.6067740380764, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.953580856323242, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6190844774246216, + "step": 2025, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008950233459473 + }, + { + "episode": 32432, + "epoch": 0.5829528705467879, + "loss/policy_avg": 0.09500116109848022, + "lr": 2.611579754601227e-06, + "objective/entropy": 75.4105453491211, + "objective/kl": 18.89362144470215, + "objective/non_score_reward": -1.8893619775772095, + "objective/rlhf_reward": -3.157448089122772, + "objective/scores": 1.1, + "policy/approxkl_avg": 45.4237060546875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4325343668460846, + "step": 2026, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984755516052246 + }, + { + "episode": 32448, + "epoch": 0.5832404644641765, + "loss/policy_avg": 0.6804702281951904, + "lr": 2.611388036809816e-06, + "objective/entropy": -103.95718383789062, + "objective/kl": 20.174652099609375, + "objective/non_score_reward": -2.017465353012085, + "objective/rlhf_reward": -3.6698614120483395, + "objective/scores": 1.1, + "policy/approxkl_avg": 23.145023345947266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5804574489593506, + "step": 2027, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.99713134765625 + }, + { + "episode": 32464, + "epoch": 0.5835280583815652, + "loss/policy_avg": 0.39008814096450806, + "lr": 2.611196319018405e-06, + "objective/entropy": 193.8568572998047, + "objective/kl": 10.51252555847168, + "objective/non_score_reward": -1.0512524843215942, + "objective/rlhf_reward": -2.471676708261172, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 48.12255859375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7742000818252563, + "step": 2028, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971537590026855 + }, + { + "episode": 32480, + "epoch": 0.5838156522989538, + "loss/policy_avg": -0.27750644087791443, + "lr": 2.611004601226994e-06, + "objective/entropy": 0.22932052612304688, + "objective/kl": 12.089995384216309, + "objective/non_score_reward": -1.2089996337890625, + "objective/rlhf_reward": -2.4359982967376705, + "objective/scores": 0.6, + "policy/approxkl_avg": 17.11049461364746, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4802630543708801, + "step": 2029, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0016231536865234 + }, + { + "episode": 32496, + "epoch": 0.5841032462163426, + "loss/policy_avg": 0.12393666803836823, + "lr": 2.610812883435583e-06, + "objective/entropy": 98.4588623046875, + "objective/kl": 19.640907287597656, + "objective/non_score_reward": -1.9640907049179077, + "objective/rlhf_reward": -9.856363296508789, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.085240364074707, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.26065224409103394, + "step": 2030, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997912049293518 + }, + { + "episode": 32512, + "epoch": 0.5843908401337312, + "loss/policy_avg": -0.44430527091026306, + "lr": 2.610621165644172e-06, + "objective/entropy": 79.62841033935547, + "objective/kl": 12.913888931274414, + "objective/non_score_reward": -1.2913891077041626, + "objective/rlhf_reward": -2.7655563116073605, + "objective/scores": 0.6, + "policy/approxkl_avg": 25.055837631225586, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.44760024547576904, + "step": 2031, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002227544784546 + }, + { + "episode": 32528, + "epoch": 0.5846784340511199, + "loss/policy_avg": -0.6092654466629028, + "lr": 2.610429447852761e-06, + "objective/entropy": 88.92134094238281, + "objective/kl": 14.717385292053223, + "objective/non_score_reward": -1.471738338470459, + "objective/rlhf_reward": -7.886953353881836, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.71894836425781, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5509711503982544, + "step": 2032, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0035958290100098 + }, + { + "episode": 32544, + "epoch": 0.5849660279685085, + "loss/policy_avg": 0.23970016837120056, + "lr": 2.61023773006135e-06, + "objective/entropy": 152.98385620117188, + "objective/kl": 15.108621597290039, + "objective/non_score_reward": -1.510862112045288, + "objective/rlhf_reward": -3.9207423648991924, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 23.307903289794922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6641147136688232, + "step": 2033, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983627796173096 + }, + { + "episode": 32560, + "epoch": 0.5852536218858971, + "loss/policy_avg": -0.2796148955821991, + "lr": 2.6100460122699384e-06, + "objective/entropy": 218.56283569335938, + "objective/kl": 13.127628326416016, + "objective/non_score_reward": -1.31276273727417, + "objective/rlhf_reward": -3.6947917928367406, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 4.39356803894043, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5682885646820068, + "step": 2034, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0047197341918945 + }, + { + "episode": 32576, + "epoch": 0.5855412158032858, + "loss/policy_avg": 0.4500732421875, + "lr": 2.6098542944785277e-06, + "objective/entropy": 165.1419677734375, + "objective/kl": 16.184364318847656, + "objective/non_score_reward": -1.618436336517334, + "objective/rlhf_reward": -6.073745763301849, + "objective/scores": 0.1, + "policy/approxkl_avg": 25.664451599121094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4045563340187073, + "step": 2035, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982361793518066 + }, + { + "episode": 32592, + "epoch": 0.5858288097206744, + "loss/policy_avg": 1.1184520721435547, + "lr": 2.6096625766871165e-06, + "objective/entropy": 74.55671691894531, + "objective/kl": 24.226789474487305, + "objective/non_score_reward": -2.4226789474487305, + "objective/rlhf_reward": -11.690715789794922, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.897789001464844, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5844389200210571, + "step": 2036, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001523494720459 + }, + { + "episode": 32608, + "epoch": 0.586116403638063, + "loss/policy_avg": -0.016746334731578827, + "lr": 2.6094708588957053e-06, + "objective/entropy": 117.7492904663086, + "objective/kl": 16.263198852539062, + "objective/non_score_reward": -1.6263197660446167, + "objective/rlhf_reward": -4.382573070303474, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 147.6581573486328, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9091885089874268, + "step": 2037, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.005404472351074 + }, + { + "episode": 32624, + "epoch": 0.5864039975554517, + "loss/policy_avg": 1.110663652420044, + "lr": 2.6092791411042945e-06, + "objective/entropy": 176.1644744873047, + "objective/kl": 15.09540843963623, + "objective/non_score_reward": -1.5095406770706177, + "objective/rlhf_reward": -8.038162231445312, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.064037322998047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7703660726547241, + "step": 2038, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991657733917236 + }, + { + "episode": 32640, + "epoch": 0.5866915914728403, + "loss/policy_avg": 0.08125920593738556, + "lr": 2.6090874233128833e-06, + "objective/entropy": 63.565513610839844, + "objective/kl": 14.566051483154297, + "objective/non_score_reward": -1.4566051959991455, + "objective/rlhf_reward": -3.8790097338723495, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 35.45296096801758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3615097105503082, + "step": 2039, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9950077533721924 + }, + { + "episode": 32656, + "epoch": 0.586979185390229, + "loss/policy_avg": 0.6111937165260315, + "lr": 2.6088957055214726e-06, + "objective/entropy": 24.75860595703125, + "objective/kl": 9.614357948303223, + "objective/non_score_reward": -0.9614357948303223, + "objective/rlhf_reward": -1.898332144098218, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 22.132753372192383, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.48295459151268005, + "step": 2040, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996814727783203 + }, + { + "episode": 32672, + "epoch": 0.5872667793076176, + "loss/policy_avg": 0.3595702648162842, + "lr": 2.6087039877300614e-06, + "objective/entropy": -32.59522247314453, + "objective/kl": 9.41091537475586, + "objective/non_score_reward": -0.9410915374755859, + "objective/rlhf_reward": 0.6356339693069462, + "objective/scores": 1.1, + "policy/approxkl_avg": 7.560471057891846, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46686244010925293, + "step": 2041, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0003132820129395 + }, + { + "episode": 32688, + "epoch": 0.5875543732250063, + "loss/policy_avg": 0.41997677087783813, + "lr": 2.60851226993865e-06, + "objective/entropy": 269.6581726074219, + "objective/kl": 15.583868980407715, + "objective/non_score_reward": -1.5583869218826294, + "objective/rlhf_reward": -3.8335475683212277, + "objective/scores": 0.6, + "policy/approxkl_avg": 26.866287231445312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7443997859954834, + "step": 2042, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007753372192383 + }, + { + "episode": 32704, + "epoch": 0.5878419671423949, + "loss/policy_avg": -0.09083029627799988, + "lr": 2.6083205521472394e-06, + "objective/entropy": 225.44314575195312, + "objective/kl": 7.201667785644531, + "objective/non_score_reward": -0.7201669216156006, + "objective/rlhf_reward": 1.51933246254921, + "objective/scores": 1.1, + "policy/approxkl_avg": 0.43545225262641907, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5843120217323303, + "step": 2043, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004441976547241 + }, + { + "episode": 32720, + "epoch": 0.5881295610597835, + "loss/policy_avg": 0.22586756944656372, + "lr": 2.6081288343558282e-06, + "objective/entropy": 8.289024353027344, + "objective/kl": 22.944671630859375, + "objective/non_score_reward": -2.2944672107696533, + "objective/rlhf_reward": -7.353040213855813, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 92.32232666015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6938124895095825, + "step": 2044, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0006496906280518 + }, + { + "episode": 32736, + "epoch": 0.5884171549771723, + "loss/policy_avg": -0.015067555010318756, + "lr": 2.6079371165644175e-06, + "objective/entropy": -206.26466369628906, + "objective/kl": 14.04336929321289, + "objective/non_score_reward": -1.4043371677398682, + "objective/rlhf_reward": -1.2173484623432156, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.8251872062683105, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.613540530204773, + "step": 2045, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0013580322265625 + }, + { + "episode": 32752, + "epoch": 0.5887047488945609, + "loss/policy_avg": 0.14012257754802704, + "lr": 2.6077453987730063e-06, + "objective/entropy": -78.76776885986328, + "objective/kl": 18.866756439208984, + "objective/non_score_reward": -1.8866755962371826, + "objective/rlhf_reward": -9.54670238494873, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.896300792694092, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.9173866510391235, + "step": 2046, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9997925758361816 + }, + { + "episode": 32768, + "epoch": 0.5889923428119496, + "loss/policy_avg": 0.01659621298313141, + "lr": 2.607553680981595e-06, + "objective/entropy": -23.22461700439453, + "objective/kl": 12.49586009979248, + "objective/non_score_reward": -1.2495861053466797, + "objective/rlhf_reward": -4.598344361782074, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.9034366607666, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5299052000045776, + "step": 2047, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0000593662261963 + }, + { + "episode": 32784, + "epoch": 0.5892799367293382, + "loss/policy_avg": 0.2230335772037506, + "lr": 2.6073619631901843e-06, + "objective/entropy": -54.427371978759766, + "objective/kl": 16.49000358581543, + "objective/non_score_reward": -1.6490004062652588, + "objective/rlhf_reward": -4.862668232123056, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 39.52677917480469, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6089744567871094, + "step": 2048, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998667597770691 + }, + { + "episode": 32800, + "epoch": 0.5895675306467268, + "loss/policy_avg": 0.08745601773262024, + "lr": 2.607170245398773e-06, + "objective/entropy": -16.302215576171875, + "objective/kl": 19.69635009765625, + "objective/non_score_reward": -1.9696348905563354, + "objective/rlhf_reward": -5.478539860248565, + "objective/scores": 0.6, + "policy/approxkl_avg": 31.309520721435547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7404862642288208, + "step": 2049, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9993966817855835 + }, + { + "episode": 32816, + "epoch": 0.5898551245641155, + "loss/policy_avg": 0.2967149019241333, + "lr": 2.606978527607362e-06, + "objective/entropy": 202.35987854003906, + "objective/kl": 21.735036849975586, + "objective/non_score_reward": -2.173503875732422, + "objective/rlhf_reward": -8.294015264511108, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.59661293029785, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5977518558502197, + "step": 2050, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998718023300171 + }, + { + "episode": 32832, + "epoch": 0.5901427184815041, + "loss/policy_avg": -0.04524856433272362, + "lr": 2.606786809815951e-06, + "objective/entropy": 199.6927490234375, + "objective/kl": 12.524423599243164, + "objective/non_score_reward": -1.2524423599243164, + "objective/rlhf_reward": -2.086050663830015, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.180833339691162, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7892223596572876, + "step": 2051, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000894069671631 + }, + { + "episode": 32848, + "epoch": 0.5904303123988928, + "loss/policy_avg": 0.1637168824672699, + "lr": 2.60659509202454e-06, + "objective/entropy": -87.76792907714844, + "objective/kl": 11.287168502807617, + "objective/non_score_reward": -1.1287169456481934, + "objective/rlhf_reward": -4.11486736536026, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.6567301750183105, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6446081399917603, + "step": 2052, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000990390777588 + }, + { + "episode": 32864, + "epoch": 0.5907179063162814, + "loss/policy_avg": 0.6465239524841309, + "lr": 2.606403374233129e-06, + "objective/entropy": 207.52310180664062, + "objective/kl": 14.48089599609375, + "objective/non_score_reward": -1.448089599609375, + "objective/rlhf_reward": -5.392358517646789, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.36521911621094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.819767951965332, + "step": 2053, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989714622497559 + }, + { + "episode": 32880, + "epoch": 0.59100550023367, + "loss/policy_avg": 0.5887457132339478, + "lr": 2.606211656441718e-06, + "objective/entropy": -170.1577606201172, + "objective/kl": 13.745086669921875, + "objective/non_score_reward": -1.3745086193084717, + "objective/rlhf_reward": -5.098034194111824, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.378250122070312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4332253634929657, + "step": 2054, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001190185546875 + }, + { + "episode": 32896, + "epoch": 0.5912930941510587, + "loss/policy_avg": -0.0851367712020874, + "lr": 2.606019938650307e-06, + "objective/entropy": -175.6912384033203, + "objective/kl": 11.137053489685059, + "objective/non_score_reward": -1.1137053966522217, + "objective/rlhf_reward": -6.454821586608887, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.83671569824219, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.573056697845459, + "step": 2055, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0009331703186035 + }, + { + "episode": 32912, + "epoch": 0.5915806880684473, + "loss/policy_avg": 0.05345374345779419, + "lr": 2.605828220858896e-06, + "objective/entropy": 35.686378479003906, + "objective/kl": 15.517290115356445, + "objective/non_score_reward": -1.5517290830612183, + "objective/rlhf_reward": -1.8069164514541622, + "objective/scores": 1.1, + "policy/approxkl_avg": 61.70820999145508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6552486419677734, + "step": 2056, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993808269500732 + }, + { + "episode": 32928, + "epoch": 0.591868281985836, + "loss/policy_avg": 0.23575273156166077, + "lr": 2.6056365030674844e-06, + "objective/entropy": -190.5808868408203, + "objective/kl": 16.56703758239746, + "objective/non_score_reward": -1.6567037105560303, + "objective/rlhf_reward": -6.226814544200897, + "objective/scores": 0.1, + "policy/approxkl_avg": 64.16632080078125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.47018104791641235, + "step": 2057, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9991967678070068 + }, + { + "episode": 32944, + "epoch": 0.5921558759032246, + "loss/policy_avg": 0.44772571325302124, + "lr": 2.6054447852760737e-06, + "objective/entropy": 51.64216995239258, + "objective/kl": 17.7371826171875, + "objective/non_score_reward": -1.7737181186676025, + "objective/rlhf_reward": -6.694872593879699, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.988739013671875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6093859672546387, + "step": 2058, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980683326721191 + }, + { + "episode": 32960, + "epoch": 0.5924434698206132, + "loss/policy_avg": 0.11212430894374847, + "lr": 2.6052530674846625e-06, + "objective/entropy": 202.07647705078125, + "objective/kl": 14.013270378112793, + "objective/non_score_reward": -1.4013270139694214, + "objective/rlhf_reward": -2.681589041591856, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 51.12955093383789, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6560755968093872, + "step": 2059, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973139762878418 + }, + { + "episode": 32976, + "epoch": 0.592731063738002, + "loss/policy_avg": 0.16852912306785583, + "lr": 2.6050613496932513e-06, + "objective/entropy": 158.34957885742188, + "objective/kl": 17.040546417236328, + "objective/non_score_reward": -1.7040544748306274, + "objective/rlhf_reward": -6.416217899322509, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.814979553222656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8345825672149658, + "step": 2060, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986486434936523 + }, + { + "episode": 32992, + "epoch": 0.5930186576553906, + "loss/policy_avg": 0.5923249125480652, + "lr": 2.6048696319018405e-06, + "objective/entropy": 77.84188079833984, + "objective/kl": 13.63203239440918, + "objective/non_score_reward": -1.3632032871246338, + "objective/rlhf_reward": -7.452813148498535, + "objective/scores": -0.5, + "policy/approxkl_avg": 71.78942108154297, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6449399590492249, + "step": 2061, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000882625579834 + }, + { + "episode": 33008, + "epoch": 0.5933062515727793, + "loss/policy_avg": 0.13562044501304626, + "lr": 2.6046779141104293e-06, + "objective/entropy": 22.380054473876953, + "objective/kl": 14.888589859008789, + "objective/non_score_reward": -1.488858938217163, + "objective/rlhf_reward": -4.008024792151387, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.43534779548645, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5684009790420532, + "step": 2062, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000018358230591 + }, + { + "episode": 33024, + "epoch": 0.5935938454901679, + "loss/policy_avg": -0.22385066747665405, + "lr": 2.6044861963190186e-06, + "objective/entropy": -12.513151168823242, + "objective/kl": 15.879064559936523, + "objective/non_score_reward": -1.5879064798355103, + "objective/rlhf_reward": -8.351625442504883, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.4901871085166931, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7548037767410278, + "step": 2063, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.002842426300049 + }, + { + "episode": 33040, + "epoch": 0.5938814394075566, + "loss/policy_avg": -0.4307985305786133, + "lr": 2.6042944785276074e-06, + "objective/entropy": 151.069091796875, + "objective/kl": 16.00432777404785, + "objective/non_score_reward": -1.6004326343536377, + "objective/rlhf_reward": -4.2790240667024, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 53.35271453857422, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.42856454849243164, + "step": 2064, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0085434913635254 + }, + { + "episode": 33056, + "epoch": 0.5941690333249452, + "loss/policy_avg": -0.3380109965801239, + "lr": 2.604102760736196e-06, + "objective/entropy": 298.4809265136719, + "objective/kl": 11.528972625732422, + "objective/non_score_reward": -1.1528971195220947, + "objective/rlhf_reward": -2.7867600872841587, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 62.940818786621094, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.983273983001709, + "step": 2065, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0027170181274414 + }, + { + "episode": 33072, + "epoch": 0.5944566272423338, + "loss/policy_avg": 0.20114445686340332, + "lr": 2.6039110429447854e-06, + "objective/entropy": 193.4998779296875, + "objective/kl": 15.039509773254395, + "objective/non_score_reward": -1.503950834274292, + "objective/rlhf_reward": -5.615803307294845, + "objective/scores": 0.1, + "policy/approxkl_avg": 107.57582092285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5463466048240662, + "step": 2066, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00111722946167 + }, + { + "episode": 33088, + "epoch": 0.5947442211597225, + "loss/policy_avg": 0.43512579798698425, + "lr": 2.6037193251533742e-06, + "objective/entropy": 73.24812316894531, + "objective/kl": 22.267837524414062, + "objective/non_score_reward": -2.2267839908599854, + "objective/rlhf_reward": -8.507136023044586, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.75790405273438, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.835204005241394, + "step": 2067, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973831176757812 + }, + { + "episode": 33104, + "epoch": 0.5950318150771111, + "loss/policy_avg": 0.06888563930988312, + "lr": 2.6035276073619635e-06, + "objective/entropy": 232.14309692382812, + "objective/kl": 15.005265235900879, + "objective/non_score_reward": -1.5005265474319458, + "objective/rlhf_reward": -5.602106219530105, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.626739501953125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5669118165969849, + "step": 2068, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966709613800049 + }, + { + "episode": 33120, + "epoch": 0.5953194089944998, + "loss/policy_avg": 0.2841304838657379, + "lr": 2.6033358895705523e-06, + "objective/entropy": -28.131656646728516, + "objective/kl": 15.817472457885742, + "objective/non_score_reward": -1.58174729347229, + "objective/rlhf_reward": -4.204282762781654, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 12.378704071044922, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7138805389404297, + "step": 2069, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970505237579346 + }, + { + "episode": 33136, + "epoch": 0.5956070029118884, + "loss/policy_avg": 0.0031527914106845856, + "lr": 2.603144171779141e-06, + "objective/entropy": 21.773712158203125, + "objective/kl": 10.743021011352539, + "objective/non_score_reward": -1.0743021965026855, + "objective/rlhf_reward": 0.10279151201248204, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.1652061939239502, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7505438327789307, + "step": 2070, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000839948654175 + }, + { + "episode": 33152, + "epoch": 0.595894596829277, + "loss/policy_avg": 0.1873525083065033, + "lr": 2.6029524539877303e-06, + "objective/entropy": 12.441814422607422, + "objective/kl": 18.700258255004883, + "objective/non_score_reward": -1.8700261116027832, + "objective/rlhf_reward": -7.080104267597198, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.97005271911621, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5343834161758423, + "step": 2071, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0008387565612793 + }, + { + "episode": 33168, + "epoch": 0.5961821907466657, + "loss/policy_avg": 0.5001279711723328, + "lr": 2.602760736196319e-06, + "objective/entropy": -264.6827392578125, + "objective/kl": 12.770312309265137, + "objective/non_score_reward": -1.2770311832427979, + "objective/rlhf_reward": -4.7081250011920925, + "objective/scores": 0.1, + "policy/approxkl_avg": 89.31438446044922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5510841608047485, + "step": 2072, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9999988079071045 + }, + { + "episode": 33184, + "epoch": 0.5964697846640543, + "loss/policy_avg": 0.15056979656219482, + "lr": 2.602569018404908e-06, + "objective/entropy": -190.62677001953125, + "objective/kl": 13.944238662719727, + "objective/non_score_reward": -1.3944240808486938, + "objective/rlhf_reward": -3.1776962041854855, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.246315002441406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7502607107162476, + "step": 2073, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.996614933013916 + }, + { + "episode": 33200, + "epoch": 0.596757378581443, + "loss/policy_avg": 0.37330591678619385, + "lr": 2.602377300613497e-06, + "objective/entropy": 47.70793533325195, + "objective/kl": 14.258697509765625, + "objective/non_score_reward": -1.4258697032928467, + "objective/rlhf_reward": -5.3034790515899655, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.65910530090332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6878872513771057, + "step": 2074, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0026094913482666 + }, + { + "episode": 33216, + "epoch": 0.5970449724988316, + "loss/policy_avg": 0.08630452305078506, + "lr": 2.602185582822086e-06, + "objective/entropy": -24.456968307495117, + "objective/kl": 13.694202423095703, + "objective/non_score_reward": -1.3694202899932861, + "objective/rlhf_reward": -5.077681159973144, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.62160873413086, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6508252620697021, + "step": 2075, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9972835779190063 + }, + { + "episode": 33232, + "epoch": 0.5973325664162203, + "loss/policy_avg": 0.7143691778182983, + "lr": 2.601993865030675e-06, + "objective/entropy": 244.7336883544922, + "objective/kl": 15.322755813598633, + "objective/non_score_reward": -1.5322757959365845, + "objective/rlhf_reward": -4.006396951452766, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 27.416717529296875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9038101434707642, + "step": 2076, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971814155578613 + }, + { + "episode": 33248, + "epoch": 0.597620160333609, + "loss/policy_avg": 0.8709487915039062, + "lr": 2.601802147239264e-06, + "objective/entropy": 22.043338775634766, + "objective/kl": 13.561304092407227, + "objective/non_score_reward": -1.356130599975586, + "objective/rlhf_reward": -1.0245220422744747, + "objective/scores": 1.1, + "policy/approxkl_avg": 17.434871673583984, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5641729235649109, + "step": 2077, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974422454833984 + }, + { + "episode": 33264, + "epoch": 0.5979077542509976, + "loss/policy_avg": 0.02021685242652893, + "lr": 2.601610429447853e-06, + "objective/entropy": -15.377296447753906, + "objective/kl": 13.26352310180664, + "objective/non_score_reward": -1.3263523578643799, + "objective/rlhf_reward": -4.9054096102714535, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.072490692138672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.62602698802948, + "step": 2078, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0023858547210693 + }, + { + "episode": 33280, + "epoch": 0.5981953481683863, + "loss/policy_avg": 0.3721355199813843, + "lr": 2.6014187116564416e-06, + "objective/entropy": -88.40010070800781, + "objective/kl": 19.315494537353516, + "objective/non_score_reward": -1.9315496683120728, + "objective/rlhf_reward": -7.326198710501194, + "objective/scores": 0.1, + "policy/approxkl_avg": 127.0091552734375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8153165578842163, + "step": 2079, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9973070621490479 + }, + { + "episode": 33296, + "epoch": 0.5984829420857749, + "loss/policy_avg": 0.30539119243621826, + "lr": 2.6012269938650304e-06, + "objective/entropy": 47.699588775634766, + "objective/kl": 14.241891860961914, + "objective/non_score_reward": -1.4241890907287598, + "objective/rlhf_reward": -5.296756184101104, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.3573634624481201, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3682810068130493, + "step": 2080, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000620126724243 + }, + { + "episode": 33312, + "epoch": 0.5987705360031635, + "loss/policy_avg": 0.04993399232625961, + "lr": 2.6010352760736197e-06, + "objective/entropy": -33.56816101074219, + "objective/kl": 16.420448303222656, + "objective/non_score_reward": -1.6420449018478394, + "objective/rlhf_reward": -2.168179845809936, + "objective/scores": 1.1, + "policy/approxkl_avg": 29.767711639404297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6949491500854492, + "step": 2081, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982733726501465 + }, + { + "episode": 33328, + "epoch": 0.5990581299205522, + "loss/policy_avg": 0.12590837478637695, + "lr": 2.6008435582822085e-06, + "objective/entropy": 70.84333801269531, + "objective/kl": 18.988134384155273, + "objective/non_score_reward": -1.898813247680664, + "objective/rlhf_reward": -5.195253229141235, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.74480438232422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5343348979949951, + "step": 2082, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998291015625 + }, + { + "episode": 33344, + "epoch": 0.5993457238379408, + "loss/policy_avg": 0.4130307734012604, + "lr": 2.6006518404907977e-06, + "objective/entropy": 143.94061279296875, + "objective/kl": 17.626792907714844, + "objective/non_score_reward": -1.7626793384552002, + "objective/rlhf_reward": -5.225888605388711, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 10.599780082702637, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5684624910354614, + "step": 2083, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001006841659546 + }, + { + "episode": 33360, + "epoch": 0.5996333177553295, + "loss/policy_avg": 0.35321566462516785, + "lr": 2.6004601226993865e-06, + "objective/entropy": 143.16958618164062, + "objective/kl": 15.126535415649414, + "objective/non_score_reward": -1.5126534700393677, + "objective/rlhf_reward": -4.103202651219304, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 11.734973907470703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3501328229904175, + "step": 2084, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999567985534668 + }, + { + "episode": 33376, + "epoch": 0.5999209116727181, + "loss/policy_avg": 0.1641218066215515, + "lr": 2.6002684049079753e-06, + "objective/entropy": -69.33512115478516, + "objective/kl": 18.16229248046875, + "objective/non_score_reward": -1.8162293434143066, + "objective/rlhf_reward": -4.864917194843292, + "objective/scores": 0.6, + "policy/approxkl_avg": 48.57612991333008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7362520098686218, + "step": 2085, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9975570440292358 + }, + { + "episode": 33392, + "epoch": 0.6002085055901067, + "loss/policy_avg": 0.4311325252056122, + "lr": 2.6000766871165646e-06, + "objective/entropy": 76.26156616210938, + "objective/kl": 15.489213943481445, + "objective/non_score_reward": -1.5489213466644287, + "objective/rlhf_reward": -1.7956856250762936, + "objective/scores": 1.1, + "policy/approxkl_avg": 85.90757751464844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4286644756793976, + "step": 2086, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998852252960205 + }, + { + "episode": 33408, + "epoch": 0.6004960995074954, + "loss/policy_avg": 0.591722846031189, + "lr": 2.5998849693251534e-06, + "objective/entropy": 189.01287841796875, + "objective/kl": 19.79343032836914, + "objective/non_score_reward": -1.979343295097351, + "objective/rlhf_reward": -9.917373657226562, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.156105041503906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8051648736000061, + "step": 2087, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000519037246704 + }, + { + "episode": 33424, + "epoch": 0.600783693424884, + "loss/policy_avg": 0.10551775991916656, + "lr": 2.599693251533742e-06, + "objective/entropy": 189.19497680664062, + "objective/kl": 17.67028045654297, + "objective/non_score_reward": -1.7670280933380127, + "objective/rlhf_reward": -4.945406439081703, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 30.548511505126953, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.47096550464630127, + "step": 2088, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998936653137207 + }, + { + "episode": 33440, + "epoch": 0.6010712873422727, + "loss/policy_avg": 0.00404435396194458, + "lr": 2.5995015337423314e-06, + "objective/entropy": 2.9050827026367188, + "objective/kl": 14.470444679260254, + "objective/non_score_reward": -1.4470446109771729, + "objective/rlhf_reward": -7.788178443908691, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.609698295593262, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4019898772239685, + "step": 2089, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999104619026184 + }, + { + "episode": 33456, + "epoch": 0.6013588812596613, + "loss/policy_avg": -0.2547715902328491, + "lr": 2.5993098159509202e-06, + "objective/entropy": 64.21862030029297, + "objective/kl": 15.499526023864746, + "objective/non_score_reward": -1.5499526262283325, + "objective/rlhf_reward": -8.199810981750488, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.8819937705993652, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.7023075222969055, + "step": 2090, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001739025115967 + }, + { + "episode": 33472, + "epoch": 0.60164647517705, + "loss/policy_avg": 0.21478338539600372, + "lr": 2.5991180981595095e-06, + "objective/entropy": -2.176471710205078, + "objective/kl": 9.324807167053223, + "objective/non_score_reward": -0.9324808120727539, + "objective/rlhf_reward": -3.3299232482910153, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.506996154785156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.37977907061576843, + "step": 2091, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980961084365845 + }, + { + "episode": 33488, + "epoch": 0.6019340690944387, + "loss/policy_avg": 0.3434308171272278, + "lr": 2.5989263803680983e-06, + "objective/entropy": -41.94181823730469, + "objective/kl": 14.127401351928711, + "objective/non_score_reward": -1.4127402305603027, + "objective/rlhf_reward": -1.250961190462112, + "objective/scores": 1.1, + "policy/approxkl_avg": 36.69062042236328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.31178468465805054, + "step": 2092, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998267650604248 + }, + { + "episode": 33504, + "epoch": 0.6022216630118273, + "loss/policy_avg": 0.012844249606132507, + "lr": 2.598734662576687e-06, + "objective/entropy": -70.4189224243164, + "objective/kl": 9.646286964416504, + "objective/non_score_reward": -0.9646286368370056, + "objective/rlhf_reward": -3.458514547348022, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.505643844604492, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5593554973602295, + "step": 2093, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0008978843688965 + }, + { + "episode": 33520, + "epoch": 0.602509256929216, + "loss/policy_avg": 2.3791656494140625, + "lr": 2.5985429447852763e-06, + "objective/entropy": -168.18963623046875, + "objective/kl": 10.93196964263916, + "objective/non_score_reward": -1.0931968688964844, + "objective/rlhf_reward": -2.7109279088383778, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 10.194269180297852, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.639975368976593, + "step": 2094, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989033937454224 + }, + { + "episode": 33536, + "epoch": 0.6027968508466046, + "loss/policy_avg": 0.23867468535900116, + "lr": 2.598351226993865e-06, + "objective/entropy": 28.35572052001953, + "objective/kl": 17.075637817382812, + "objective/non_score_reward": -1.7075636386871338, + "objective/rlhf_reward": -2.4302548527717587, + "objective/scores": 1.1, + "policy/approxkl_avg": 29.440872192382812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6343339681625366, + "step": 2095, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9969229698181152 + }, + { + "episode": 33552, + "epoch": 0.6030844447639933, + "loss/policy_avg": 0.27316632866859436, + "lr": 2.5981595092024544e-06, + "objective/entropy": -110.1327896118164, + "objective/kl": 14.059263229370117, + "objective/non_score_reward": -1.40592622756958, + "objective/rlhf_reward": -3.798876370462488, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 13.267318725585938, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7431437969207764, + "step": 2096, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000326633453369 + }, + { + "episode": 33568, + "epoch": 0.6033720386813819, + "loss/policy_avg": 0.04450540989637375, + "lr": 2.597967791411043e-06, + "objective/entropy": -200.97593688964844, + "objective/kl": 14.460651397705078, + "objective/non_score_reward": -1.4460651874542236, + "objective/rlhf_reward": -4.050927476088206, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 13.181662559509277, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6776976585388184, + "step": 2097, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983975887298584 + }, + { + "episode": 33584, + "epoch": 0.6036596325987705, + "loss/policy_avg": 0.43644464015960693, + "lr": 2.597776073619632e-06, + "objective/entropy": -169.66427612304688, + "objective/kl": 14.372163772583008, + "objective/non_score_reward": -1.4372166395187378, + "objective/rlhf_reward": -1.348866498470306, + "objective/scores": 1.1, + "policy/approxkl_avg": 105.80145263671875, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.46849268674850464, + "step": 2098, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9996814727783203 + }, + { + "episode": 33600, + "epoch": 0.6039472265161592, + "loss/policy_avg": -0.29122257232666016, + "lr": 2.597584355828221e-06, + "objective/entropy": 185.86685180664062, + "objective/kl": 13.328239440917969, + "objective/non_score_reward": -1.3328239917755127, + "objective/rlhf_reward": -4.9312962055206295, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.79552960395813, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4727972149848938, + "step": 2099, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019476413726807 + }, + { + "episode": 33616, + "epoch": 0.6042348204335478, + "loss/policy_avg": 0.20983460545539856, + "lr": 2.59739263803681e-06, + "objective/entropy": -98.51879119873047, + "objective/kl": 16.45016860961914, + "objective/non_score_reward": -1.6450170278549194, + "objective/rlhf_reward": -4.755239243778298, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 99.38389587402344, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.591879665851593, + "step": 2100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0003983974456787 + }, + { + "episode": 33632, + "epoch": 0.6045224143509365, + "loss/policy_avg": 0.22341737151145935, + "lr": 2.597200920245399e-06, + "objective/entropy": -89.65531921386719, + "objective/kl": 13.054361343383789, + "objective/non_score_reward": -1.305436134338379, + "objective/rlhf_reward": -7.221744537353516, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.273288726806641, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.5509786605834961, + "step": 2101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0029349327087402 + }, + { + "episode": 33648, + "epoch": 0.6048100082683251, + "loss/policy_avg": 0.3788665533065796, + "lr": 2.5970092024539876e-06, + "objective/entropy": 6.503326416015625, + "objective/kl": 19.492740631103516, + "objective/non_score_reward": -1.9492741823196411, + "objective/rlhf_reward": -9.797096252441406, + "objective/scores": -0.5, + "policy/approxkl_avg": 48.16189193725586, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5450559854507446, + "step": 2102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992316961288452 + }, + { + "episode": 33664, + "epoch": 0.6050976021857137, + "loss/policy_avg": -0.001059025526046753, + "lr": 2.5968174846625764e-06, + "objective/entropy": 101.92574310302734, + "objective/kl": 8.146560668945312, + "objective/non_score_reward": -0.8146560192108154, + "objective/rlhf_reward": -0.8586242109537123, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.331566572189331, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.59111088514328, + "step": 2103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000415086746216 + }, + { + "episode": 33680, + "epoch": 0.6053851961031024, + "loss/policy_avg": -0.27263638377189636, + "lr": 2.5966257668711657e-06, + "objective/entropy": 77.05372619628906, + "objective/kl": 17.506303787231445, + "objective/non_score_reward": -1.7506303787231445, + "objective/rlhf_reward": -6.602521447837352, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.02352523803711, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5078814029693604, + "step": 2104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00350284576416 + }, + { + "episode": 33696, + "epoch": 0.605672790020491, + "loss/policy_avg": 0.2301843762397766, + "lr": 2.5964340490797545e-06, + "objective/entropy": 137.177734375, + "objective/kl": 9.50554084777832, + "objective/non_score_reward": -0.9505541324615479, + "objective/rlhf_reward": -1.4022166490554808, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.2152836322784424, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.690402626991272, + "step": 2105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0016870498657227 + }, + { + "episode": 33712, + "epoch": 0.6059603839378798, + "loss/policy_avg": 0.06770651787519455, + "lr": 2.5962423312883437e-06, + "objective/entropy": 2.0992813110351562, + "objective/kl": 9.80885124206543, + "objective/non_score_reward": -0.9808850288391113, + "objective/rlhf_reward": -0.9998212500822273, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 2.15012526512146, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5921926498413086, + "step": 2106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003202438354492 + }, + { + "episode": 33728, + "epoch": 0.6062479778552684, + "loss/policy_avg": 0.25330886244773865, + "lr": 2.5960506134969325e-06, + "objective/entropy": 196.8042449951172, + "objective/kl": 22.02609634399414, + "objective/non_score_reward": -2.2026097774505615, + "objective/rlhf_reward": -4.410439348220825, + "objective/scores": 1.1, + "policy/approxkl_avg": 146.92312622070312, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.510128915309906, + "step": 2107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996383547782898 + }, + { + "episode": 33744, + "epoch": 0.606535571772657, + "loss/policy_avg": 0.3180674910545349, + "lr": 2.5958588957055213e-06, + "objective/entropy": 109.73847198486328, + "objective/kl": 10.875676155090332, + "objective/non_score_reward": -1.087567687034607, + "objective/rlhf_reward": -3.95027065873146, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.97682189941406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3390194773674011, + "step": 2108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9979220628738403 + }, + { + "episode": 33760, + "epoch": 0.6068231656900457, + "loss/policy_avg": 0.31364506483078003, + "lr": 2.5956671779141106e-06, + "objective/entropy": 108.84683227539062, + "objective/kl": 22.46896743774414, + "objective/non_score_reward": -2.246896743774414, + "objective/rlhf_reward": -7.4313281466632635, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 53.97173309326172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.512633204460144, + "step": 2109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998118281364441 + }, + { + "episode": 33776, + "epoch": 0.6071107596074343, + "loss/policy_avg": 0.10164511948823929, + "lr": 2.5954754601226994e-06, + "objective/entropy": 154.34152221679688, + "objective/kl": 13.435369491577148, + "objective/non_score_reward": -1.3435368537902832, + "objective/rlhf_reward": -2.4504285796892376, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 6.303553104400635, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6599438190460205, + "step": 2110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999377965927124 + }, + { + "episode": 33792, + "epoch": 0.607398353524823, + "loss/policy_avg": 0.16709858179092407, + "lr": 2.595283742331288e-06, + "objective/entropy": 188.6673583984375, + "objective/kl": 19.079668045043945, + "objective/non_score_reward": -1.9079668521881104, + "objective/rlhf_reward": -5.68445612020963, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 9.285051345825195, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7654898166656494, + "step": 2111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0025992393493652 + }, + { + "episode": 33808, + "epoch": 0.6076859474422116, + "loss/policy_avg": 0.09121402353048325, + "lr": 2.5950920245398774e-06, + "objective/entropy": 31.230186462402344, + "objective/kl": 15.165401458740234, + "objective/non_score_reward": -1.516540288925171, + "objective/rlhf_reward": -8.066161155700684, + "objective/scores": -0.5, + "policy/approxkl_avg": 43.637332916259766, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7986932396888733, + "step": 2112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979455471038818 + }, + { + "episode": 33824, + "epoch": 0.6079735413596002, + "loss/policy_avg": 0.25094541907310486, + "lr": 2.5949003067484662e-06, + "objective/entropy": -11.48862075805664, + "objective/kl": 18.022008895874023, + "objective/non_score_reward": -1.8022009134292603, + "objective/rlhf_reward": -6.80880377292633, + "objective/scores": 0.1, + "policy/approxkl_avg": 36.03727722167969, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.582868218421936, + "step": 2113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999412298202515 + }, + { + "episode": 33840, + "epoch": 0.6082611352769889, + "loss/policy_avg": 0.4924889802932739, + "lr": 2.5947085889570555e-06, + "objective/entropy": -97.55308532714844, + "objective/kl": 14.677192687988281, + "objective/non_score_reward": -1.467719316482544, + "objective/rlhf_reward": -1.4708772957324978, + "objective/scores": 1.1, + "policy/approxkl_avg": 17.880861282348633, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7399845719337463, + "step": 2114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0017504692077637 + }, + { + "episode": 33856, + "epoch": 0.6085487291943775, + "loss/policy_avg": 0.4132453203201294, + "lr": 2.5945168711656443e-06, + "objective/entropy": 52.88839340209961, + "objective/kl": 18.023258209228516, + "objective/non_score_reward": -1.802325963973999, + "objective/rlhf_reward": -5.3844751968708735, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 36.086788177490234, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5082260370254517, + "step": 2115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986095428466797 + }, + { + "episode": 33872, + "epoch": 0.6088363231117662, + "loss/policy_avg": 0.22052277624607086, + "lr": 2.594325153374233e-06, + "objective/entropy": 129.5975799560547, + "objective/kl": 18.520706176757812, + "objective/non_score_reward": -1.8520703315734863, + "objective/rlhf_reward": -7.008281609416008, + "objective/scores": 0.1, + "policy/approxkl_avg": 51.77848815917969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7177315354347229, + "step": 2116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9968171119689941 + }, + { + "episode": 33888, + "epoch": 0.6091239170291548, + "loss/policy_avg": 0.14162278175354004, + "lr": 2.5941334355828223e-06, + "objective/entropy": -17.719482421875, + "objective/kl": 7.724222183227539, + "objective/non_score_reward": -0.7724223136901855, + "objective/rlhf_reward": 1.3103109240531925, + "objective/scores": 1.1, + "policy/approxkl_avg": 69.4462890625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8562661409378052, + "step": 2117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9992499351501465 + }, + { + "episode": 33904, + "epoch": 0.6094115109465434, + "loss/policy_avg": 0.1741405427455902, + "lr": 2.593941717791411e-06, + "objective/entropy": 275.2198486328125, + "objective/kl": 18.36294937133789, + "objective/non_score_reward": -1.8362950086593628, + "objective/rlhf_reward": -6.94518015384674, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.012439727783203, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9217780828475952, + "step": 2118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999818801879883 + }, + { + "episode": 33920, + "epoch": 0.6096991048639321, + "loss/policy_avg": 4.892614364624023, + "lr": 2.5937500000000004e-06, + "objective/entropy": -6.740753173828125, + "objective/kl": 8.5400390625, + "objective/non_score_reward": -0.85400390625, + "objective/rlhf_reward": -5.416015625, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.4418704509735107, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.538404107093811, + "step": 2119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0201470851898193 + }, + { + "episode": 33936, + "epoch": 0.6099866987813207, + "loss/policy_avg": 0.21670030057430267, + "lr": 2.593558282208589e-06, + "objective/entropy": 2.4147682189941406, + "objective/kl": 15.552010536193848, + "objective/non_score_reward": -1.5552010536193848, + "objective/rlhf_reward": -8.220804214477539, + "objective/scores": -0.5, + "policy/approxkl_avg": 105.36181640625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6115633249282837, + "step": 2120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9976463317871094 + }, + { + "episode": 33952, + "epoch": 0.6102742926987095, + "loss/policy_avg": 0.033039819449186325, + "lr": 2.593366564417178e-06, + "objective/entropy": 73.02333068847656, + "objective/kl": 22.716238021850586, + "objective/non_score_reward": -2.2716236114501953, + "objective/rlhf_reward": -8.686495041847229, + "objective/scores": 0.1, + "policy/approxkl_avg": 73.9190444946289, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.49127280712127686, + "step": 2121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999624252319336 + }, + { + "episode": 33968, + "epoch": 0.6105618866160981, + "loss/policy_avg": -0.7839405536651611, + "lr": 2.5931748466257672e-06, + "objective/entropy": 69.94853973388672, + "objective/kl": 12.057262420654297, + "objective/non_score_reward": -1.2057262659072876, + "objective/rlhf_reward": -0.4229049146175381, + "objective/scores": 1.1, + "policy/approxkl_avg": 41.606651306152344, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6370517015457153, + "step": 2122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0159318447113037 + }, + { + "episode": 33984, + "epoch": 0.6108494805334868, + "loss/policy_avg": 0.07773086428642273, + "lr": 2.5929831288343556e-06, + "objective/entropy": -92.78646087646484, + "objective/kl": 13.057992935180664, + "objective/non_score_reward": -1.3057992458343506, + "objective/rlhf_reward": -0.8231969535350796, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.172804832458496, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6352230310440063, + "step": 2123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.99819016456604 + }, + { + "episode": 34000, + "epoch": 0.6111370744508754, + "loss/policy_avg": 0.22555482387542725, + "lr": 2.592791411042945e-06, + "objective/entropy": -72.2308349609375, + "objective/kl": 17.175045013427734, + "objective/non_score_reward": -1.7175045013427734, + "objective/rlhf_reward": -6.4700182437896725, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.779685974121094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4434419870376587, + "step": 2124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.996457576751709 + }, + { + "episode": 34016, + "epoch": 0.611424668368264, + "loss/policy_avg": 0.05994892120361328, + "lr": 2.5925996932515336e-06, + "objective/entropy": 246.38250732421875, + "objective/kl": 14.213615417480469, + "objective/non_score_reward": -1.4213614463806152, + "objective/rlhf_reward": -7.685445785522461, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.60198974609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7083953619003296, + "step": 2125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002152442932129 + }, + { + "episode": 34032, + "epoch": 0.6117122622856527, + "loss/policy_avg": 0.12799659371376038, + "lr": 2.5924079754601225e-06, + "objective/entropy": 208.40374755859375, + "objective/kl": 19.928834915161133, + "objective/non_score_reward": -1.9928836822509766, + "objective/rlhf_reward": -3.5715342521667477, + "objective/scores": 1.1, + "policy/approxkl_avg": 145.67617797851562, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5317258238792419, + "step": 2126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993952512741089 + }, + { + "episode": 34048, + "epoch": 0.6119998562030413, + "loss/policy_avg": 0.6138862371444702, + "lr": 2.5922162576687117e-06, + "objective/entropy": -125.1441421508789, + "objective/kl": 18.083675384521484, + "objective/non_score_reward": -1.8083676099777222, + "objective/rlhf_reward": -6.833470410108566, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.46153259277344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6206263899803162, + "step": 2127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993391036987305 + }, + { + "episode": 34064, + "epoch": 0.61228745012043, + "loss/policy_avg": 0.24099749326705933, + "lr": 2.5920245398773005e-06, + "objective/entropy": 63.56041717529297, + "objective/kl": 15.835171699523926, + "objective/non_score_reward": -1.583517074584961, + "objective/rlhf_reward": -4.509239728721688, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 29.088022232055664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7318596243858337, + "step": 2128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989407062530518 + }, + { + "episode": 34080, + "epoch": 0.6125750440378186, + "loss/policy_avg": 0.10305094718933105, + "lr": 2.5918328220858897e-06, + "objective/entropy": 70.02751159667969, + "objective/kl": 16.354942321777344, + "objective/non_score_reward": -1.6354944705963135, + "objective/rlhf_reward": -8.541976928710938, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.052690505981445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6812468767166138, + "step": 2129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002415180206299 + }, + { + "episode": 34096, + "epoch": 0.6128626379552072, + "loss/policy_avg": 0.42362314462661743, + "lr": 2.5916411042944785e-06, + "objective/entropy": -75.88069152832031, + "objective/kl": 17.92064094543457, + "objective/non_score_reward": -1.7920641899108887, + "objective/rlhf_reward": -5.220845411496098, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 29.6253662109375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.632318377494812, + "step": 2130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992579221725464 + }, + { + "episode": 34112, + "epoch": 0.6131502318725959, + "loss/policy_avg": 0.37457984685897827, + "lr": 2.5914493865030674e-06, + "objective/entropy": 124.02549743652344, + "objective/kl": 18.799131393432617, + "objective/non_score_reward": -1.879913091659546, + "objective/rlhf_reward": -9.5196533203125, + "objective/scores": -0.5, + "policy/approxkl_avg": 69.57762908935547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6399707794189453, + "step": 2131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9947148561477661 + }, + { + "episode": 34128, + "epoch": 0.6134378257899845, + "loss/policy_avg": 0.46973419189453125, + "lr": 2.5912576687116566e-06, + "objective/entropy": -160.57627868652344, + "objective/kl": 11.583633422851562, + "objective/non_score_reward": -1.1583632230758667, + "objective/rlhf_reward": -1.709734116436216, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.947965621948242, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7383812665939331, + "step": 2132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979161024093628 + }, + { + "episode": 34144, + "epoch": 0.6137254197073732, + "loss/policy_avg": 0.0801706463098526, + "lr": 2.5910659509202454e-06, + "objective/entropy": -42.889068603515625, + "objective/kl": 15.087331771850586, + "objective/non_score_reward": -1.5087332725524902, + "objective/rlhf_reward": -1.6349332094192501, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.216672897338867, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4354102611541748, + "step": 2133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970632791519165 + }, + { + "episode": 34160, + "epoch": 0.6140130136247618, + "loss/policy_avg": 0.3341831564903259, + "lr": 2.5908742331288346e-06, + "objective/entropy": -53.92280197143555, + "objective/kl": 22.34456443786621, + "objective/non_score_reward": -2.2344565391540527, + "objective/rlhf_reward": -10.937826156616211, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.245609283447266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7126411199569702, + "step": 2134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983025789260864 + }, + { + "episode": 34176, + "epoch": 0.6143006075421504, + "loss/policy_avg": -0.20949918031692505, + "lr": 2.5906825153374234e-06, + "objective/entropy": 16.97821044921875, + "objective/kl": 14.048188209533691, + "objective/non_score_reward": -1.4048187732696533, + "objective/rlhf_reward": -7.619275093078613, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.851763725280762, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.42240235209465027, + "step": 2135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991393089294434 + }, + { + "episode": 34192, + "epoch": 0.6145882014595392, + "loss/policy_avg": 0.36326050758361816, + "lr": 2.5904907975460122e-06, + "objective/entropy": -49.466636657714844, + "objective/kl": 14.126543045043945, + "objective/non_score_reward": -1.412654161453247, + "objective/rlhf_reward": -3.9172833124796544, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 15.922784805297852, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5579012632369995, + "step": 2136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999401569366455 + }, + { + "episode": 34208, + "epoch": 0.6148757953769278, + "loss/policy_avg": 0.023842569440603256, + "lr": 2.5902990797546015e-06, + "objective/entropy": 50.25873565673828, + "objective/kl": 14.170000076293945, + "objective/non_score_reward": -1.4170001745224, + "objective/rlhf_reward": -5.268000727891922, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.010520935058594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5795632600784302, + "step": 2137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991328716278076 + }, + { + "episode": 34224, + "epoch": 0.6151633892943165, + "loss/policy_avg": 0.04017828032374382, + "lr": 2.5901073619631903e-06, + "objective/entropy": 88.83770751953125, + "objective/kl": 18.84758186340332, + "objective/non_score_reward": -1.8847582340240479, + "objective/rlhf_reward": -7.139032757282257, + "objective/scores": 0.1, + "policy/approxkl_avg": 74.74348449707031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5087491273880005, + "step": 2138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004384517669678 + }, + { + "episode": 34240, + "epoch": 0.6154509832117051, + "loss/policy_avg": 0.026270896196365356, + "lr": 2.589915644171779e-06, + "objective/entropy": 81.49057006835938, + "objective/kl": 17.17093276977539, + "objective/non_score_reward": -1.7170931100845337, + "objective/rlhf_reward": -6.468372440338134, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.445743560791016, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4986514449119568, + "step": 2139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982997179031372 + }, + { + "episode": 34256, + "epoch": 0.6157385771290937, + "loss/policy_avg": 0.1535506248474121, + "lr": 2.5897239263803683e-06, + "objective/entropy": 138.4166259765625, + "objective/kl": 15.88475227355957, + "objective/non_score_reward": -1.588475227355957, + "objective/rlhf_reward": -8.353900909423828, + "objective/scores": -0.5, + "policy/approxkl_avg": 81.95653533935547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6369169354438782, + "step": 2140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999618530273438 + }, + { + "episode": 34272, + "epoch": 0.6160261710464824, + "loss/policy_avg": 0.22110715508460999, + "lr": 2.589532208588957e-06, + "objective/entropy": -51.144073486328125, + "objective/kl": 16.943445205688477, + "objective/non_score_reward": -1.6943447589874268, + "objective/rlhf_reward": -2.377378916740417, + "objective/scores": 1.1, + "policy/approxkl_avg": 32.16203689575195, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5588784217834473, + "step": 2141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998297929763794 + }, + { + "episode": 34288, + "epoch": 0.616313764963871, + "loss/policy_avg": -0.39085668325424194, + "lr": 2.5893404907975464e-06, + "objective/entropy": 218.20677185058594, + "objective/kl": 11.77801513671875, + "objective/non_score_reward": -1.177801489830017, + "objective/rlhf_reward": -2.5884996376195293, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 39.3564338684082, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6796581745147705, + "step": 2142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00933837890625 + }, + { + "episode": 34304, + "epoch": 0.6166013588812597, + "loss/policy_avg": 0.7143984436988831, + "lr": 2.589148773006135e-06, + "objective/entropy": 257.098388671875, + "objective/kl": 25.953664779663086, + "objective/non_score_reward": -2.5953664779663086, + "objective/rlhf_reward": -8.258759739176305, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 243.64572143554688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6894684433937073, + "step": 2143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984405040740967 + }, + { + "episode": 34320, + "epoch": 0.6168889527986483, + "loss/policy_avg": 0.07599098980426788, + "lr": 2.588957055214724e-06, + "objective/entropy": -22.387248992919922, + "objective/kl": 22.233367919921875, + "objective/non_score_reward": -2.223336935043335, + "objective/rlhf_reward": -10.89334774017334, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.48845863342285, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.804703950881958, + "step": 2144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000074863433838 + }, + { + "episode": 34336, + "epoch": 0.617176546716037, + "loss/policy_avg": 0.28670692443847656, + "lr": 2.588765337423313e-06, + "objective/entropy": 70.882568359375, + "objective/kl": 14.724905014038086, + "objective/non_score_reward": -1.4724903106689453, + "objective/rlhf_reward": -3.4899616003036495, + "objective/scores": 0.6, + "policy/approxkl_avg": 10.478135108947754, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7913788557052612, + "step": 2145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978686571121216 + }, + { + "episode": 34352, + "epoch": 0.6174641406334256, + "loss/policy_avg": 0.19924291968345642, + "lr": 2.5885736196319016e-06, + "objective/entropy": 123.67643737792969, + "objective/kl": 19.60727310180664, + "objective/non_score_reward": -1.960727334022522, + "objective/rlhf_reward": -7.4429093360900875, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.37279510498047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49038147926330566, + "step": 2146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988164901733398 + }, + { + "episode": 34368, + "epoch": 0.6177517345508142, + "loss/policy_avg": 0.12335044145584106, + "lr": 2.588381901840491e-06, + "objective/entropy": 158.38768005371094, + "objective/kl": 7.689789772033691, + "objective/non_score_reward": -0.768979012966156, + "objective/rlhf_reward": -0.9532098195710517, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 3.716804027557373, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.700070858001709, + "step": 2147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9987845420837402 + }, + { + "episode": 34384, + "epoch": 0.6180393284682029, + "loss/policy_avg": 0.7577451467514038, + "lr": 2.5881901840490797e-06, + "objective/entropy": 111.6480712890625, + "objective/kl": 20.38470458984375, + "objective/non_score_reward": -2.038470506668091, + "objective/rlhf_reward": -10.153882026672363, + "objective/scores": -0.5, + "policy/approxkl_avg": 47.94173049926758, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6813660860061646, + "step": 2148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996566772460938 + }, + { + "episode": 34400, + "epoch": 0.6183269223855915, + "loss/policy_avg": 0.2539398670196533, + "lr": 2.5879984662576685e-06, + "objective/entropy": -34.9099006652832, + "objective/kl": 8.784804344177246, + "objective/non_score_reward": -0.8784804940223694, + "objective/rlhf_reward": -3.113921976089477, + "objective/scores": 0.1, + "policy/approxkl_avg": 26.73517608642578, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671658992767334, + "step": 2149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0013880729675293 + }, + { + "episode": 34416, + "epoch": 0.6186145163029801, + "loss/policy_avg": 0.14950716495513916, + "lr": 2.5878067484662577e-06, + "objective/entropy": 6.590118408203125, + "objective/kl": 18.786617279052734, + "objective/non_score_reward": -1.8786617517471313, + "objective/rlhf_reward": -3.1146468877792355, + "objective/scores": 1.1, + "policy/approxkl_avg": 25.397262573242188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5859331488609314, + "step": 2150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9985339641571045 + }, + { + "episode": 34432, + "epoch": 0.6189021102203688, + "loss/policy_avg": -0.10131794214248657, + "lr": 2.5876150306748465e-06, + "objective/entropy": 168.03851318359375, + "objective/kl": 5.868832588195801, + "objective/non_score_reward": -0.5868832468986511, + "objective/rlhf_reward": 2.052467012405396, + "objective/scores": 1.1, + "policy/approxkl_avg": 2.2829232215881348, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6661413908004761, + "step": 2151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.008175849914551 + }, + { + "episode": 34448, + "epoch": 0.6191897041377575, + "loss/policy_avg": 0.39233455061912537, + "lr": 2.5874233128834357e-06, + "objective/entropy": 115.36898040771484, + "objective/kl": 11.404361724853516, + "objective/non_score_reward": -1.1404361724853516, + "objective/rlhf_reward": -6.561744689941406, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.5582275390625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8021819591522217, + "step": 2152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.00008487701416 + }, + { + "episode": 34464, + "epoch": 0.6194772980551462, + "loss/policy_avg": 0.2662584185600281, + "lr": 2.5872315950920245e-06, + "objective/entropy": 249.67864990234375, + "objective/kl": 17.200284957885742, + "objective/non_score_reward": -1.7200286388397217, + "objective/rlhf_reward": -8.880114555358887, + "objective/scores": -0.5, + "policy/approxkl_avg": 23.378746032714844, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6200641393661499, + "step": 2153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999152660369873 + }, + { + "episode": 34480, + "epoch": 0.6197648919725348, + "loss/policy_avg": -0.2773422300815582, + "lr": 2.5870398773006134e-06, + "objective/entropy": 203.25189208984375, + "objective/kl": 16.20944595336914, + "objective/non_score_reward": -1.620944619178772, + "objective/rlhf_reward": -6.083778536319732, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.978534698486328, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7069727182388306, + "step": 2154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999535083770752 + }, + { + "episode": 34496, + "epoch": 0.6200524858899235, + "loss/policy_avg": 0.25541549921035767, + "lr": 2.5868481595092026e-06, + "objective/entropy": -46.375946044921875, + "objective/kl": 12.585369110107422, + "objective/non_score_reward": -1.2585369348526, + "objective/rlhf_reward": -4.634147977828979, + "objective/scores": 0.1, + "policy/approxkl_avg": 53.30701446533203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4830355644226074, + "step": 2155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997302293777466 + }, + { + "episode": 34512, + "epoch": 0.6203400798073121, + "loss/policy_avg": 0.5278187394142151, + "lr": 2.5866564417177914e-06, + "objective/entropy": 19.008319854736328, + "objective/kl": 10.38141918182373, + "objective/non_score_reward": -1.0381418466567993, + "objective/rlhf_reward": 0.24743238985538518, + "objective/scores": 1.1, + "policy/approxkl_avg": 39.551109313964844, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7112669348716736, + "step": 2156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0008163452148438 + }, + { + "episode": 34528, + "epoch": 0.6206276737247007, + "loss/policy_avg": 0.23893393576145172, + "lr": 2.5864647239263806e-06, + "objective/entropy": -12.230049133300781, + "objective/kl": 22.22760009765625, + "objective/non_score_reward": -2.2227602005004883, + "objective/rlhf_reward": -4.491040995717048, + "objective/scores": 1.1, + "policy/approxkl_avg": 50.70768737792969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7049212455749512, + "step": 2157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9978277683258057 + }, + { + "episode": 34544, + "epoch": 0.6209152676420894, + "loss/policy_avg": 0.23087981343269348, + "lr": 2.5862730061349694e-06, + "objective/entropy": 85.62858581542969, + "objective/kl": 15.143526077270508, + "objective/non_score_reward": -1.5143526792526245, + "objective/rlhf_reward": -8.057411193847656, + "objective/scores": -0.5, + "policy/approxkl_avg": 98.90402221679688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7348974943161011, + "step": 2158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0005483627319336 + }, + { + "episode": 34560, + "epoch": 0.621202861559478, + "loss/policy_avg": 0.13627511262893677, + "lr": 2.5860812883435583e-06, + "objective/entropy": 126.21749877929688, + "objective/kl": 13.286699295043945, + "objective/non_score_reward": -1.3286700248718262, + "objective/rlhf_reward": -2.390960876585218, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 29.746185302734375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8702835440635681, + "step": 2159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9985427856445312 + }, + { + "episode": 34576, + "epoch": 0.6214904554768667, + "loss/policy_avg": 0.5095393657684326, + "lr": 2.5858895705521475e-06, + "objective/entropy": -173.3975830078125, + "objective/kl": 19.235570907592773, + "objective/non_score_reward": -1.9235572814941406, + "objective/rlhf_reward": -5.746817897038396, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 23.77338981628418, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4544439911842346, + "step": 2160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970526695251465 + }, + { + "episode": 34592, + "epoch": 0.6217780493942553, + "loss/policy_avg": -0.14496827125549316, + "lr": 2.5856978527607363e-06, + "objective/entropy": -33.920597076416016, + "objective/kl": 12.791587829589844, + "objective/non_score_reward": -1.2791587114334106, + "objective/rlhf_reward": -4.716634964942932, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.218846321105957, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7137322425842285, + "step": 2161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000558376312256 + }, + { + "episode": 34608, + "epoch": 0.6220656433116439, + "loss/policy_avg": 0.478646457195282, + "lr": 2.585506134969325e-06, + "objective/entropy": 182.32119750976562, + "objective/kl": 20.583545684814453, + "objective/non_score_reward": -2.058354616165161, + "objective/rlhf_reward": -6.408590073856423, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 24.505878448486328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5689951181411743, + "step": 2162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981651306152344 + }, + { + "episode": 34624, + "epoch": 0.6223532372290326, + "loss/policy_avg": 0.2710217237472534, + "lr": 2.5853144171779143e-06, + "objective/entropy": 16.150726318359375, + "objective/kl": 15.956924438476562, + "objective/non_score_reward": -1.5956923961639404, + "objective/rlhf_reward": -5.9827698230743405, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.574764251708984, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8749411106109619, + "step": 2163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991207122802734 + }, + { + "episode": 34640, + "epoch": 0.6226408311464212, + "loss/policy_avg": 0.17157940566539764, + "lr": 2.585122699386503e-06, + "objective/entropy": 147.69705200195312, + "objective/kl": 12.435047149658203, + "objective/non_score_reward": -1.2435047626495361, + "objective/rlhf_reward": -6.974018573760986, + "objective/scores": -0.5, + "policy/approxkl_avg": 39.84374237060547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7384412288665771, + "step": 2164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993503093719482 + }, + { + "episode": 34656, + "epoch": 0.6229284250638099, + "loss/policy_avg": 0.39116328954696655, + "lr": 2.5849309815950924e-06, + "objective/entropy": 124.02345275878906, + "objective/kl": 17.45986557006836, + "objective/non_score_reward": -1.7459867000579834, + "objective/rlhf_reward": -6.583946830034256, + "objective/scores": 0.1, + "policy/approxkl_avg": 131.04702758789062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.602171003818512, + "step": 2165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996525526046753 + }, + { + "episode": 34672, + "epoch": 0.6232160189811985, + "loss/policy_avg": -0.323068767786026, + "lr": 2.584739263803681e-06, + "objective/entropy": 23.200973510742188, + "objective/kl": 19.314199447631836, + "objective/non_score_reward": -1.931420087814331, + "objective/rlhf_reward": -4.8019612475645275, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 11.689594268798828, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.6117805242538452, + "step": 2166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.005882501602173 + }, + { + "episode": 34688, + "epoch": 0.6235036128985872, + "loss/policy_avg": 0.07887321710586548, + "lr": 2.58454754601227e-06, + "objective/entropy": 207.01947021484375, + "objective/kl": 16.987045288085938, + "objective/non_score_reward": -1.6987048387527466, + "objective/rlhf_reward": -6.394819355010986, + "objective/scores": 0.1, + "policy/approxkl_avg": 18.009706497192383, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5339441299438477, + "step": 2167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004165172576904 + }, + { + "episode": 34704, + "epoch": 0.6237912068159759, + "loss/policy_avg": 0.6891222596168518, + "lr": 2.584355828220859e-06, + "objective/entropy": -21.742630004882812, + "objective/kl": 17.947301864624023, + "objective/non_score_reward": -1.794730305671692, + "objective/rlhf_reward": -6.7789211332798, + "objective/scores": 0.1, + "policy/approxkl_avg": 28.476604461669922, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7632720470428467, + "step": 2168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9985806941986084 + }, + { + "episode": 34720, + "epoch": 0.6240788007333645, + "loss/policy_avg": 0.48487964272499084, + "lr": 2.5841641104294476e-06, + "objective/entropy": 294.52569580078125, + "objective/kl": 16.802213668823242, + "objective/non_score_reward": -1.6802215576171875, + "objective/rlhf_reward": -4.320886051654815, + "objective/scores": 0.6, + "policy/approxkl_avg": 57.61943054199219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7062151432037354, + "step": 2169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998595952987671 + }, + { + "episode": 34736, + "epoch": 0.6243663946507532, + "loss/policy_avg": 0.16234064102172852, + "lr": 2.583972392638037e-06, + "objective/entropy": -8.229663848876953, + "objective/kl": 12.112107276916504, + "objective/non_score_reward": -1.21121084690094, + "objective/rlhf_reward": -4.444843447208404, + "objective/scores": 0.1, + "policy/approxkl_avg": 66.40957641601562, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.623025119304657, + "step": 2170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0005271434783936 + }, + { + "episode": 34752, + "epoch": 0.6246539885681418, + "loss/policy_avg": 0.4374160170555115, + "lr": 2.5837806748466257e-06, + "objective/entropy": 215.26991271972656, + "objective/kl": 14.201522827148438, + "objective/non_score_reward": -1.4201524257659912, + "objective/rlhf_reward": -7.680609703063965, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.60218811035156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6455620527267456, + "step": 2171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000101089477539 + }, + { + "episode": 34768, + "epoch": 0.6249415824855304, + "loss/policy_avg": 0.9038959741592407, + "lr": 2.583588957055215e-06, + "objective/entropy": 164.766357421875, + "objective/kl": 10.354567527770996, + "objective/non_score_reward": -1.035456895828247, + "objective/rlhf_reward": -6.141827583312988, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.68964385986328, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4149360656738281, + "step": 2172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0044846534729004 + }, + { + "episode": 34784, + "epoch": 0.6252291764029191, + "loss/policy_avg": -0.24729087948799133, + "lr": 2.5833972392638037e-06, + "objective/entropy": 109.50682067871094, + "objective/kl": 12.598501205444336, + "objective/non_score_reward": -1.259850263595581, + "objective/rlhf_reward": -4.639400786161422, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.71780776977539, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6037960648536682, + "step": 2173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0024328231811523 + }, + { + "episode": 34800, + "epoch": 0.6255167703203077, + "loss/policy_avg": -0.07739436626434326, + "lr": 2.5832055214723925e-06, + "objective/entropy": 82.29547119140625, + "objective/kl": 13.763072967529297, + "objective/non_score_reward": -1.3763072490692139, + "objective/rlhf_reward": -3.901109311644154, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 19.90868377685547, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4547133445739746, + "step": 2174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001060962677 + }, + { + "episode": 34816, + "epoch": 0.6258043642376964, + "loss/policy_avg": 0.5104341506958008, + "lr": 2.5830138036809817e-06, + "objective/entropy": 193.64498901367188, + "objective/kl": 19.853286743164062, + "objective/non_score_reward": -1.9853289127349854, + "objective/rlhf_reward": -7.54131588935852, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.88241958618164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8123583793640137, + "step": 2175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996779441833496 + }, + { + "episode": 34832, + "epoch": 0.626091958155085, + "loss/policy_avg": 0.26050132513046265, + "lr": 2.5828220858895706e-06, + "objective/entropy": 112.65647888183594, + "objective/kl": 15.261848449707031, + "objective/non_score_reward": -1.5261847972869873, + "objective/rlhf_reward": -5.704739069938659, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.26208209991455, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5949582457542419, + "step": 2176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002542018890381 + }, + { + "episode": 34848, + "epoch": 0.6263795520724736, + "loss/policy_avg": -0.006953395903110504, + "lr": 2.5826303680981594e-06, + "objective/entropy": -35.79608154296875, + "objective/kl": 8.320389747619629, + "objective/non_score_reward": -0.8320389986038208, + "objective/rlhf_reward": -1.2054497919240332, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.80181622505188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5749397873878479, + "step": 2177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.003983974456787 + }, + { + "episode": 34864, + "epoch": 0.6266671459898623, + "loss/policy_avg": 0.44770270586013794, + "lr": 2.5824386503067486e-06, + "objective/entropy": -24.63408851623535, + "objective/kl": 14.420526504516602, + "objective/non_score_reward": -1.4420526027679443, + "objective/rlhf_reward": -7.768210411071777, + "objective/scores": -0.5, + "policy/approxkl_avg": 61.44035339355469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6937098503112793, + "step": 2178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9966654777526855 + }, + { + "episode": 34880, + "epoch": 0.6269547399072509, + "loss/policy_avg": 0.4457979202270508, + "lr": 2.5822469325153374e-06, + "objective/entropy": 13.465904235839844, + "objective/kl": 19.523706436157227, + "objective/non_score_reward": -1.9523706436157227, + "objective/rlhf_reward": -4.885763649584028, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 37.15239715576172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5032546520233154, + "step": 2179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000812530517578 + }, + { + "episode": 34896, + "epoch": 0.6272423338246396, + "loss/policy_avg": 0.487781822681427, + "lr": 2.5820552147239266e-06, + "objective/entropy": 143.8909912109375, + "objective/kl": 19.466087341308594, + "objective/non_score_reward": -1.9466089010238647, + "objective/rlhf_reward": -5.663729192987953, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 75.92357635498047, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.524456262588501, + "step": 2180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970227479934692 + }, + { + "episode": 34912, + "epoch": 0.6275299277420282, + "loss/policy_avg": 0.6310595273971558, + "lr": 2.5818634969325154e-06, + "objective/entropy": 20.076377868652344, + "objective/kl": 9.920660018920898, + "objective/non_score_reward": -0.9920661449432373, + "objective/rlhf_reward": -3.568264371156692, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.638607025146484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5525496006011963, + "step": 2181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988200664520264 + }, + { + "episode": 34928, + "epoch": 0.627817521659417, + "loss/policy_avg": -0.31129300594329834, + "lr": 2.5816717791411043e-06, + "objective/entropy": 63.76435089111328, + "objective/kl": 15.30907154083252, + "objective/non_score_reward": -1.530907154083252, + "objective/rlhf_reward": -5.723628854751587, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.06525802612305, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5918775200843811, + "step": 2182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001953125 + }, + { + "episode": 34944, + "epoch": 0.6281051155768056, + "loss/policy_avg": 0.24727630615234375, + "lr": 2.5814800613496935e-06, + "objective/entropy": 208.6817626953125, + "objective/kl": 15.481653213500977, + "objective/non_score_reward": -1.5481653213500977, + "objective/rlhf_reward": -3.7926613152027127, + "objective/scores": 0.6, + "policy/approxkl_avg": 18.966259002685547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5907472372055054, + "step": 2183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998089075088501 + }, + { + "episode": 34960, + "epoch": 0.6283927094941942, + "loss/policy_avg": 0.40158331394195557, + "lr": 2.5812883435582823e-06, + "objective/entropy": 100.16302490234375, + "objective/kl": 16.88314437866211, + "objective/non_score_reward": -1.6883143186569214, + "objective/rlhf_reward": -6.353257274627685, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.46660614013672, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5955220460891724, + "step": 2184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998399257659912 + }, + { + "episode": 34976, + "epoch": 0.6286803034115829, + "loss/policy_avg": 0.44291186332702637, + "lr": 2.5810966257668715e-06, + "objective/entropy": 107.09803009033203, + "objective/kl": 20.636152267456055, + "objective/non_score_reward": -2.063615322113037, + "objective/rlhf_reward": -5.330741797329161, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 71.88838958740234, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6331840753555298, + "step": 2185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.00006103515625 + }, + { + "episode": 34992, + "epoch": 0.6289678973289715, + "loss/policy_avg": 0.20810046792030334, + "lr": 2.5809049079754603e-06, + "objective/entropy": 352.854736328125, + "objective/kl": 21.351238250732422, + "objective/non_score_reward": -2.1351239681243896, + "objective/rlhf_reward": -8.140495753288269, + "objective/scores": 0.1, + "policy/approxkl_avg": 64.23518371582031, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8724610805511475, + "step": 2186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992315769195557 + }, + { + "episode": 35008, + "epoch": 0.6292554912463602, + "loss/policy_avg": 0.1791592687368393, + "lr": 2.580713190184049e-06, + "objective/entropy": 237.42965698242188, + "objective/kl": 15.62234115600586, + "objective/non_score_reward": -1.5622341632843018, + "objective/rlhf_reward": -3.8489364743232723, + "objective/scores": 0.6, + "policy/approxkl_avg": 95.47721862792969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9437090158462524, + "step": 2187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988491535186768 + }, + { + "episode": 35024, + "epoch": 0.6295430851637488, + "loss/policy_avg": 0.3126150369644165, + "lr": 2.5805214723926384e-06, + "objective/entropy": -6.007709503173828, + "objective/kl": 25.649818420410156, + "objective/non_score_reward": -2.5649819374084473, + "objective/rlhf_reward": -7.336208750249121, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 263.31591796875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4821949005126953, + "step": 2188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998019814491272 + }, + { + "episode": 35040, + "epoch": 0.6298306790811374, + "loss/policy_avg": 0.041694313287734985, + "lr": 2.580329754601227e-06, + "objective/entropy": -69.11876678466797, + "objective/kl": 11.226625442504883, + "objective/non_score_reward": -1.1226625442504883, + "objective/rlhf_reward": -4.090650296211242, + "objective/scores": 0.1, + "policy/approxkl_avg": 0.908916711807251, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4388587474822998, + "step": 2189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0019636154174805 + }, + { + "episode": 35056, + "epoch": 0.6301182729985261, + "loss/policy_avg": 0.11040852963924408, + "lr": 2.580138036809816e-06, + "objective/entropy": 247.57484436035156, + "objective/kl": 21.945003509521484, + "objective/non_score_reward": -2.194500207901001, + "objective/rlhf_reward": -10.778000831604004, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.54824447631836, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7419674396514893, + "step": 2190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986804723739624 + }, + { + "episode": 35072, + "epoch": 0.6304058669159147, + "loss/policy_avg": 0.19531464576721191, + "lr": 2.579946319018405e-06, + "objective/entropy": 258.46746826171875, + "objective/kl": 12.224682807922363, + "objective/non_score_reward": -1.222468376159668, + "objective/rlhf_reward": -4.489873206615448, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.236316680908203, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6577352285385132, + "step": 2191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999692678451538 + }, + { + "episode": 35088, + "epoch": 0.6306934608333034, + "loss/policy_avg": 0.2209586203098297, + "lr": 2.5797546012269936e-06, + "objective/entropy": -99.6376953125, + "objective/kl": 20.67704963684082, + "objective/non_score_reward": -2.067704916000366, + "objective/rlhf_reward": -10.270819664001465, + "objective/scores": -0.5, + "policy/approxkl_avg": 105.64271545410156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6112191677093506, + "step": 2192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9972584247589111 + }, + { + "episode": 35104, + "epoch": 0.630981054750692, + "loss/policy_avg": -0.1769150048494339, + "lr": 2.579562883435583e-06, + "objective/entropy": -255.36572265625, + "objective/kl": 13.98208236694336, + "objective/non_score_reward": -1.3982082605361938, + "objective/rlhf_reward": -5.19283310174942, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.08042335510254, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6317181587219238, + "step": 2193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0026397705078125 + }, + { + "episode": 35120, + "epoch": 0.6312686486680806, + "loss/policy_avg": 0.07765771448612213, + "lr": 2.5793711656441717e-06, + "objective/entropy": -76.46177673339844, + "objective/kl": 6.011279106140137, + "objective/non_score_reward": -0.6011279821395874, + "objective/rlhf_reward": -0.8003920353093918, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 2.6643407344818115, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5533638000488281, + "step": 2194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986436367034912 + }, + { + "episode": 35136, + "epoch": 0.6315562425854693, + "loss/policy_avg": 1.0513546466827393, + "lr": 2.579179447852761e-06, + "objective/entropy": 6.0850830078125, + "objective/kl": 20.63864517211914, + "objective/non_score_reward": -2.0638644695281982, + "objective/rlhf_reward": -7.855458116531372, + "objective/scores": 0.1, + "policy/approxkl_avg": 87.43357849121094, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5335657000541687, + "step": 2195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983572959899902 + }, + { + "episode": 35152, + "epoch": 0.6318438365028579, + "loss/policy_avg": 0.9989783763885498, + "lr": 2.5789877300613497e-06, + "objective/entropy": 235.584716796875, + "objective/kl": 14.521580696105957, + "objective/non_score_reward": -1.4521582126617432, + "objective/rlhf_reward": -5.408632612228393, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.214359283447266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8818501234054565, + "step": 2196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998250961303711 + }, + { + "episode": 35168, + "epoch": 0.6321314304202467, + "loss/policy_avg": 0.17217445373535156, + "lr": 2.5787960122699385e-06, + "objective/entropy": -112.44560241699219, + "objective/kl": 17.418087005615234, + "objective/non_score_reward": -1.7418086528778076, + "objective/rlhf_reward": -8.967233657836914, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.461700439453125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6804555058479309, + "step": 2197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9982492923736572 + }, + { + "episode": 35184, + "epoch": 0.6324190243376353, + "loss/policy_avg": -0.1301862895488739, + "lr": 2.5786042944785278e-06, + "objective/entropy": 202.34921264648438, + "objective/kl": 19.765905380249023, + "objective/non_score_reward": -1.9765905141830444, + "objective/rlhf_reward": -7.506362175941467, + "objective/scores": 0.1, + "policy/approxkl_avg": 22.713653564453125, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7051633596420288, + "step": 2198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999467134475708 + }, + { + "episode": 35200, + "epoch": 0.632706618255024, + "loss/policy_avg": 0.4120163917541504, + "lr": 2.5784125766871166e-06, + "objective/entropy": 178.94874572753906, + "objective/kl": 18.871971130371094, + "objective/non_score_reward": -1.8871972560882568, + "objective/rlhf_reward": -9.548789024353027, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.069515228271484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7663606405258179, + "step": 2199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001192569732666 + }, + { + "episode": 35216, + "epoch": 0.6329942121724126, + "loss/policy_avg": 0.7163809537887573, + "lr": 2.5782208588957054e-06, + "objective/entropy": 66.29650115966797, + "objective/kl": 14.291718482971191, + "objective/non_score_reward": -1.4291718006134033, + "objective/rlhf_reward": -7.716687202453613, + "objective/scores": -0.5, + "policy/approxkl_avg": 101.60475158691406, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5268129706382751, + "step": 2200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996827483177185 + }, + { + "episode": 35232, + "epoch": 0.6332818060898012, + "loss/policy_avg": 0.028121359646320343, + "lr": 2.5780291411042946e-06, + "objective/entropy": 282.8331604003906, + "objective/kl": 16.655906677246094, + "objective/non_score_reward": -1.665590763092041, + "objective/rlhf_reward": -4.262362873554229, + "objective/scores": 0.6, + "policy/approxkl_avg": 1.9305273294448853, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7045209407806396, + "step": 2201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0025622844696045 + }, + { + "episode": 35248, + "epoch": 0.6335694000071899, + "loss/policy_avg": 0.09198611974716187, + "lr": 2.5778374233128834e-06, + "objective/entropy": 9.080646514892578, + "objective/kl": 14.53342056274414, + "objective/non_score_reward": -1.4533421993255615, + "objective/rlhf_reward": -5.413368529081344, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.323448419570923, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8238525390625, + "step": 2202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999985694885254 + }, + { + "episode": 35264, + "epoch": 0.6338569939245785, + "loss/policy_avg": 0.4124908745288849, + "lr": 2.5776457055214726e-06, + "objective/entropy": -354.57403564453125, + "objective/kl": 12.172119140625, + "objective/non_score_reward": -1.2172119617462158, + "objective/rlhf_reward": -4.468847846984863, + "objective/scores": 0.1, + "policy/approxkl_avg": 46.26481628417969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6718429327011108, + "step": 2203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 1.9982786178588867 + }, + { + "episode": 35280, + "epoch": 0.6341445878419671, + "loss/policy_avg": 0.09881316870450974, + "lr": 2.5774539877300615e-06, + "objective/entropy": 38.90171813964844, + "objective/kl": 13.71724796295166, + "objective/non_score_reward": -1.3717248439788818, + "objective/rlhf_reward": -5.086899465322494, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.995763778686523, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6945664882659912, + "step": 2204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9988892078399658 + }, + { + "episode": 35296, + "epoch": 0.6344321817593558, + "loss/policy_avg": 0.31177273392677307, + "lr": 2.5772622699386503e-06, + "objective/entropy": -98.65557861328125, + "objective/kl": 15.464643478393555, + "objective/non_score_reward": -1.5464643239974976, + "objective/rlhf_reward": -3.7858574151992794, + "objective/scores": 0.6, + "policy/approxkl_avg": 19.114368438720703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5550520420074463, + "step": 2205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9985768795013428 + }, + { + "episode": 35312, + "epoch": 0.6347197756767444, + "loss/policy_avg": 0.22732603549957275, + "lr": 2.5770705521472395e-06, + "objective/entropy": 45.62663650512695, + "objective/kl": 12.204948425292969, + "objective/non_score_reward": -1.2204947471618652, + "objective/rlhf_reward": -2.48197910785675, + "objective/scores": 0.6, + "policy/approxkl_avg": 23.474681854248047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5377674102783203, + "step": 2206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9971551895141602 + }, + { + "episode": 35328, + "epoch": 0.6350073695941331, + "loss/policy_avg": 0.6622054576873779, + "lr": 2.5768788343558283e-06, + "objective/entropy": -16.36989974975586, + "objective/kl": 14.84906005859375, + "objective/non_score_reward": -1.4849061965942383, + "objective/rlhf_reward": -3.816918554083381, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 23.934528350830078, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5687286853790283, + "step": 2207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993095397949219 + }, + { + "episode": 35344, + "epoch": 0.6352949635115217, + "loss/policy_avg": 0.6508431434631348, + "lr": 2.5766871165644175e-06, + "objective/entropy": 12.291481018066406, + "objective/kl": 16.560619354248047, + "objective/non_score_reward": -1.6560620069503784, + "objective/rlhf_reward": -8.624248504638672, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.811452865600586, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7654626369476318, + "step": 2208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9988648891448975 + }, + { + "episode": 35360, + "epoch": 0.6355825574289103, + "loss/policy_avg": 0.24387690424919128, + "lr": 2.5764953987730063e-06, + "objective/entropy": 224.07534790039062, + "objective/kl": 22.381715774536133, + "objective/non_score_reward": -2.238171100616455, + "objective/rlhf_reward": -7.219351843992868, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 65.56282043457031, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.9638044834136963, + "step": 2209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000033140182495 + }, + { + "episode": 35376, + "epoch": 0.635870151346299, + "loss/policy_avg": 0.09040187299251556, + "lr": 2.576303680981595e-06, + "objective/entropy": 79.04753875732422, + "objective/kl": 23.915878295898438, + "objective/non_score_reward": -2.391587495803833, + "objective/rlhf_reward": -9.166350162029268, + "objective/scores": 0.1, + "policy/approxkl_avg": 94.76409912109375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.37892991304397583, + "step": 2210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9963321685791016 + }, + { + "episode": 35392, + "epoch": 0.6361577452636876, + "loss/policy_avg": -0.1181950569152832, + "lr": 2.5761119631901844e-06, + "objective/entropy": 209.74940490722656, + "objective/kl": 15.779610633850098, + "objective/non_score_reward": -1.5779609680175781, + "objective/rlhf_reward": -8.311843872070312, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.557290077209473, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7678151726722717, + "step": 2211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0032005310058594 + }, + { + "episode": 35408, + "epoch": 0.6364453391810764, + "loss/policy_avg": 0.29135096073150635, + "lr": 2.5759202453987728e-06, + "objective/entropy": -126.32437133789062, + "objective/kl": 14.804946899414062, + "objective/non_score_reward": -1.480494499206543, + "objective/rlhf_reward": -5.521978384256363, + "objective/scores": 0.1, + "policy/approxkl_avg": 96.0042724609375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8453115224838257, + "step": 2212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997128963470459 + }, + { + "episode": 35424, + "epoch": 0.636732933098465, + "loss/policy_avg": 0.23182812333106995, + "lr": 2.575728527607362e-06, + "objective/entropy": 116.05543518066406, + "objective/kl": 16.660411834716797, + "objective/non_score_reward": -1.666041374206543, + "objective/rlhf_reward": -3.7404463633310527, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 33.42845916748047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6534185409545898, + "step": 2213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983100891113281 + }, + { + "episode": 35440, + "epoch": 0.6370205270158537, + "loss/policy_avg": 0.3903784453868866, + "lr": 2.575536809815951e-06, + "objective/entropy": -84.97456359863281, + "objective/kl": 11.240859985351562, + "objective/non_score_reward": -1.1240859031677246, + "objective/rlhf_reward": -2.548932800965245, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 29.660133361816406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41411858797073364, + "step": 2214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9963812828063965 + }, + { + "episode": 35456, + "epoch": 0.6373081209332423, + "loss/policy_avg": 0.5894661545753479, + "lr": 2.5753450920245396e-06, + "objective/entropy": 63.571022033691406, + "objective/kl": 17.98499870300293, + "objective/non_score_reward": -1.7984999418258667, + "objective/rlhf_reward": -4.793999528884887, + "objective/scores": 0.6, + "policy/approxkl_avg": 10.82928466796875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6973264217376709, + "step": 2215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998317956924438 + }, + { + "episode": 35472, + "epoch": 0.6375957148506309, + "loss/policy_avg": 0.04164008051156998, + "lr": 2.575153374233129e-06, + "objective/entropy": 111.24397277832031, + "objective/kl": 13.686548233032227, + "objective/non_score_reward": -1.3686549663543701, + "objective/rlhf_reward": -7.4746198654174805, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.952038049697876, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7927234172821045, + "step": 2216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0024447441101074 + }, + { + "episode": 35488, + "epoch": 0.6378833087680196, + "loss/policy_avg": 0.00472402386367321, + "lr": 2.5749616564417177e-06, + "objective/entropy": 40.287269592285156, + "objective/kl": 15.373626708984375, + "objective/non_score_reward": -1.5373626947402954, + "objective/rlhf_reward": -5.749450838565826, + "objective/scores": 0.1, + "policy/approxkl_avg": 15.220986366271973, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6288732290267944, + "step": 2217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989573955535889 + }, + { + "episode": 35504, + "epoch": 0.6381709026854082, + "loss/policy_avg": -0.1521018147468567, + "lr": 2.574769938650307e-06, + "objective/entropy": -65.29877471923828, + "objective/kl": 16.19780921936035, + "objective/non_score_reward": -1.6197808980941772, + "objective/rlhf_reward": -8.479124069213867, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.63850474357605, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7238868474960327, + "step": 2218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0011723041534424 + }, + { + "episode": 35520, + "epoch": 0.6384584966027969, + "loss/policy_avg": 0.22721001505851746, + "lr": 2.5745782208588957e-06, + "objective/entropy": 8.837226867675781, + "objective/kl": 12.960824966430664, + "objective/non_score_reward": -1.296082615852356, + "objective/rlhf_reward": -3.628071083632067, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 2.8337883949279785, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6526169776916504, + "step": 2219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999176025390625 + }, + { + "episode": 35536, + "epoch": 0.6387460905201855, + "loss/policy_avg": 0.32163006067276, + "lr": 2.5743865030674845e-06, + "objective/entropy": 56.67760467529297, + "objective/kl": 23.63744354248047, + "objective/non_score_reward": -2.3637444972991943, + "objective/rlhf_reward": -7.507566521840031, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 38.21224594116211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7226811647415161, + "step": 2220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99851655960083 + }, + { + "episode": 35552, + "epoch": 0.6390336844375741, + "loss/policy_avg": 0.2586836814880371, + "lr": 2.5741947852760738e-06, + "objective/entropy": 127.28630828857422, + "objective/kl": 14.451157569885254, + "objective/non_score_reward": -1.445115566253662, + "objective/rlhf_reward": -7.780462741851807, + "objective/scores": -0.5, + "policy/approxkl_avg": 10.218864440917969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6248810887336731, + "step": 2221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987244606018066 + }, + { + "episode": 35568, + "epoch": 0.6393212783549628, + "loss/policy_avg": 0.5988067984580994, + "lr": 2.5740030674846626e-06, + "objective/entropy": 310.6684265136719, + "objective/kl": 14.75908374786377, + "objective/non_score_reward": -1.4759085178375244, + "objective/rlhf_reward": -5.503633654117584, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.692106246948242, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7594001293182373, + "step": 2222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993855953216553 + }, + { + "episode": 35584, + "epoch": 0.6396088722723514, + "loss/policy_avg": 0.01884036883711815, + "lr": 2.573811349693252e-06, + "objective/entropy": 269.0919494628906, + "objective/kl": 11.99139404296875, + "objective/non_score_reward": -1.1991393566131592, + "objective/rlhf_reward": -2.396557545661926, + "objective/scores": 0.6, + "policy/approxkl_avg": 5.300789833068848, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6532878279685974, + "step": 2223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982726573944092 + }, + { + "episode": 35600, + "epoch": 0.63989646618974, + "loss/policy_avg": 0.11802544444799423, + "lr": 2.5736196319018406e-06, + "objective/entropy": -66.02192687988281, + "objective/kl": 18.99173927307129, + "objective/non_score_reward": -1.8991740942001343, + "objective/rlhf_reward": -5.473990144506965, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 31.625484466552734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6842813491821289, + "step": 2224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983280897140503 + }, + { + "episode": 35616, + "epoch": 0.6401840601071287, + "loss/policy_avg": 0.12923726439476013, + "lr": 2.5734279141104294e-06, + "objective/entropy": 312.881103515625, + "objective/kl": 20.01099967956543, + "objective/non_score_reward": -2.0011000633239746, + "objective/rlhf_reward": -10.004400253295898, + "objective/scores": -0.5, + "policy/approxkl_avg": 64.80439758300781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8023340106010437, + "step": 2225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9957354068756104 + }, + { + "episode": 35632, + "epoch": 0.6404716540245173, + "loss/policy_avg": 0.16196031868457794, + "lr": 2.5732361963190187e-06, + "objective/entropy": -3.3410987854003906, + "objective/kl": 20.95907974243164, + "objective/non_score_reward": -2.0959081649780273, + "objective/rlhf_reward": -3.983632212877273, + "objective/scores": 1.1, + "policy/approxkl_avg": 58.89521408081055, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8127495050430298, + "step": 2226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9977425336837769 + }, + { + "episode": 35648, + "epoch": 0.640759247941906, + "loss/policy_avg": 0.3808813691139221, + "lr": 2.5730444785276075e-06, + "objective/entropy": -48.94996643066406, + "objective/kl": 17.937030792236328, + "objective/non_score_reward": -1.7937030792236328, + "objective/rlhf_reward": -2.7748126149177548, + "objective/scores": 1.1, + "policy/approxkl_avg": 38.477439880371094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7229498624801636, + "step": 2227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999549150466919 + }, + { + "episode": 35664, + "epoch": 0.6410468418592947, + "loss/policy_avg": 0.5239992737770081, + "lr": 2.5728527607361963e-06, + "objective/entropy": -146.61708068847656, + "objective/kl": 14.59036636352539, + "objective/non_score_reward": -1.4590365886688232, + "objective/rlhf_reward": -4.174287145555602, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 9.376331329345703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6471960544586182, + "step": 2228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.998356819152832 + }, + { + "episode": 35680, + "epoch": 0.6413344357766834, + "loss/policy_avg": 0.0809326097369194, + "lr": 2.5726610429447855e-06, + "objective/entropy": 139.27410888671875, + "objective/kl": 12.63823127746582, + "objective/non_score_reward": -1.26382315158844, + "objective/rlhf_reward": -7.05529260635376, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.977476119995117, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9265446662902832, + "step": 2229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994497299194336 + }, + { + "episode": 35696, + "epoch": 0.641622029694072, + "loss/policy_avg": 0.40282607078552246, + "lr": 2.5724693251533743e-06, + "objective/entropy": 74.11846160888672, + "objective/kl": 15.80052375793457, + "objective/non_score_reward": -1.580052375793457, + "objective/rlhf_reward": -5.920209205150604, + "objective/scores": 0.1, + "policy/approxkl_avg": 74.08509826660156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.44877538084983826, + "step": 2230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998140811920166 + }, + { + "episode": 35712, + "epoch": 0.6419096236114606, + "loss/policy_avg": -0.023778066039085388, + "lr": 2.5722776073619635e-06, + "objective/entropy": 160.42071533203125, + "objective/kl": 18.071002960205078, + "objective/non_score_reward": -1.8071002960205078, + "objective/rlhf_reward": -9.228401184082031, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.72297668457031, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7528549432754517, + "step": 2231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997116327285767 + }, + { + "episode": 35728, + "epoch": 0.6421972175288493, + "loss/policy_avg": -0.12553301453590393, + "lr": 2.5720858895705524e-06, + "objective/entropy": -118.97586059570312, + "objective/kl": 12.522685050964355, + "objective/non_score_reward": -1.2522684335708618, + "objective/rlhf_reward": -0.6090736746788021, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.815485000610352, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5027907490730286, + "step": 2232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002737045288086 + }, + { + "episode": 35744, + "epoch": 0.6424848114462379, + "loss/policy_avg": 0.5497890710830688, + "lr": 2.571894171779141e-06, + "objective/entropy": 66.39590454101562, + "objective/kl": 15.05737590789795, + "objective/non_score_reward": -1.5057374238967896, + "objective/rlhf_reward": -5.622949934005737, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.636985778808594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7436261773109436, + "step": 2233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996707439422607 + }, + { + "episode": 35760, + "epoch": 0.6427724053636266, + "loss/policy_avg": 0.035653773695230484, + "lr": 2.57170245398773e-06, + "objective/entropy": 91.70762634277344, + "objective/kl": 21.097057342529297, + "objective/non_score_reward": -2.109705924987793, + "objective/rlhf_reward": -5.5151040896188945, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 11.354876518249512, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8169498443603516, + "step": 2234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000910997390747 + }, + { + "episode": 35776, + "epoch": 0.6430599992810152, + "loss/policy_avg": 0.043960705399513245, + "lr": 2.5715107361963188e-06, + "objective/entropy": -137.8557586669922, + "objective/kl": 16.38060760498047, + "objective/non_score_reward": -1.6380605697631836, + "objective/rlhf_reward": -6.152242219448089, + "objective/scores": 0.1, + "policy/approxkl_avg": 71.73484802246094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5255050659179688, + "step": 2235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994916915893555 + }, + { + "episode": 35792, + "epoch": 0.6433475931984038, + "loss/policy_avg": 0.1638680398464203, + "lr": 2.571319018404908e-06, + "objective/entropy": -82.89593505859375, + "objective/kl": 18.11610221862793, + "objective/non_score_reward": -1.811610221862793, + "objective/rlhf_reward": -6.846440827846527, + "objective/scores": 0.1, + "policy/approxkl_avg": 43.05682373046875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5359176397323608, + "step": 2236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976513385772705 + }, + { + "episode": 35808, + "epoch": 0.6436351871157925, + "loss/policy_avg": 0.5455341339111328, + "lr": 2.571127300613497e-06, + "objective/entropy": 136.18603515625, + "objective/kl": 15.515323638916016, + "objective/non_score_reward": -1.55153226852417, + "objective/rlhf_reward": -8.20612907409668, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.07931137084961, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5907673835754395, + "step": 2237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9959040880203247 + }, + { + "episode": 35824, + "epoch": 0.6439227810331811, + "loss/policy_avg": 0.5567790269851685, + "lr": 2.570935582822086e-06, + "objective/entropy": -230.35806274414062, + "objective/kl": 15.261903762817383, + "objective/non_score_reward": -1.5261905193328857, + "objective/rlhf_reward": -3.7047623157501217, + "objective/scores": 0.6, + "policy/approxkl_avg": 28.395586013793945, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7888268232345581, + "step": 2238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9991778135299683 + }, + { + "episode": 35840, + "epoch": 0.6442103749505698, + "loss/policy_avg": 0.22221067547798157, + "lr": 2.570743865030675e-06, + "objective/entropy": 51.5346794128418, + "objective/kl": 20.33760643005371, + "objective/non_score_reward": -2.0337605476379395, + "objective/rlhf_reward": -10.135042190551758, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.599842071533203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6876698136329651, + "step": 2239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994490146636963 + }, + { + "episode": 35856, + "epoch": 0.6444979688679584, + "loss/policy_avg": 0.24794799089431763, + "lr": 2.5705521472392637e-06, + "objective/entropy": 101.41891479492188, + "objective/kl": 10.934738159179688, + "objective/non_score_reward": -1.0934739112854004, + "objective/rlhf_reward": 0.026104474067688344, + "objective/scores": 1.1, + "policy/approxkl_avg": 10.448841094970703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7271315455436707, + "step": 2240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9968032836914062 + }, + { + "episode": 35872, + "epoch": 0.644785562785347, + "loss/policy_avg": -0.15643905103206635, + "lr": 2.570360429447853e-06, + "objective/entropy": -17.24741554260254, + "objective/kl": 14.677450180053711, + "objective/non_score_reward": -1.467745065689087, + "objective/rlhf_reward": -1.4709803819656369, + "objective/scores": 1.1, + "policy/approxkl_avg": 27.442018508911133, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.48285290598869324, + "step": 2241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0026421546936035 + }, + { + "episode": 35888, + "epoch": 0.6450731567027357, + "loss/policy_avg": 1.282882571220398, + "lr": 2.5701687116564417e-06, + "objective/entropy": -16.522808074951172, + "objective/kl": 12.279808044433594, + "objective/non_score_reward": -1.2279808521270752, + "objective/rlhf_reward": -2.7892173997321468, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 17.468976974487305, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5521416068077087, + "step": 2242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001244068145752 + }, + { + "episode": 35904, + "epoch": 0.6453607506201244, + "loss/policy_avg": 0.6120246648788452, + "lr": 2.5699769938650305e-06, + "objective/entropy": -58.17369842529297, + "objective/kl": 9.920572280883789, + "objective/non_score_reward": -0.992057204246521, + "objective/rlhf_reward": 0.43177100419998204, + "objective/scores": 1.1, + "policy/approxkl_avg": 12.923457145690918, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5883625745773315, + "step": 2243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0009925365448 + }, + { + "episode": 35920, + "epoch": 0.6456483445375131, + "loss/policy_avg": 0.21067282557487488, + "lr": 2.5697852760736198e-06, + "objective/entropy": -4.984157562255859, + "objective/kl": 18.276622772216797, + "objective/non_score_reward": -1.8276622295379639, + "objective/rlhf_reward": -4.386930142284605, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 100.22476196289062, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6311012506484985, + "step": 2244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.999492883682251 + }, + { + "episode": 35936, + "epoch": 0.6459359384549017, + "loss/policy_avg": 0.12202519178390503, + "lr": 2.5695935582822086e-06, + "objective/entropy": 167.21475219726562, + "objective/kl": 13.268052101135254, + "objective/non_score_reward": -1.3268052339553833, + "objective/rlhf_reward": -7.307220935821533, + "objective/scores": -0.5, + "policy/approxkl_avg": 40.42982482910156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6104716658592224, + "step": 2245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0034589767456055 + }, + { + "episode": 35952, + "epoch": 0.6462235323722904, + "loss/policy_avg": 0.04010404273867607, + "lr": 2.569401840490798e-06, + "objective/entropy": -75.36890411376953, + "objective/kl": 17.823413848876953, + "objective/non_score_reward": -1.782341480255127, + "objective/rlhf_reward": -5.006659509913002, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 15.992976188659668, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5671273469924927, + "step": 2246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0030524730682373 + }, + { + "episode": 35968, + "epoch": 0.646511126289679, + "loss/policy_avg": 0.4200291037559509, + "lr": 2.5692101226993866e-06, + "objective/entropy": -238.55908203125, + "objective/kl": 19.406217575073242, + "objective/non_score_reward": -1.9406214952468872, + "objective/rlhf_reward": -9.76248550415039, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.762123107910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.527471661567688, + "step": 2247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9961581230163574 + }, + { + "episode": 35984, + "epoch": 0.6467987202070676, + "loss/policy_avg": -0.33676448464393616, + "lr": 2.5690184049079754e-06, + "objective/entropy": -120.24238586425781, + "objective/kl": 13.346169471740723, + "objective/non_score_reward": -1.334617018699646, + "objective/rlhf_reward": -4.9384682610630986, + "objective/scores": 0.1, + "policy/approxkl_avg": 27.310237884521484, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5737497210502625, + "step": 2248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0009799003601074 + }, + { + "episode": 36000, + "epoch": 0.6470863141244563, + "loss/policy_avg": 0.5354056358337402, + "lr": 2.5688266871165647e-06, + "objective/entropy": -36.78726577758789, + "objective/kl": 20.65894889831543, + "objective/non_score_reward": -2.0658950805664062, + "objective/rlhf_reward": -6.707320540156916, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 43.62550354003906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7647566199302673, + "step": 2249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9989060163497925 + }, + { + "episode": 36016, + "epoch": 0.6473739080418449, + "loss/policy_avg": 0.046671465039253235, + "lr": 2.5686349693251535e-06, + "objective/entropy": 301.4794921875, + "objective/kl": 17.496397018432617, + "objective/non_score_reward": -1.7496397495269775, + "objective/rlhf_reward": -8.998558044433594, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.562626838684082, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.73357754945755, + "step": 2250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000436305999756 + }, + { + "episode": 36032, + "epoch": 0.6476615019592336, + "loss/policy_avg": 0.14498819410800934, + "lr": 2.5684432515337423e-06, + "objective/entropy": -76.91134643554688, + "objective/kl": 21.101152420043945, + "objective/non_score_reward": -2.1101155281066895, + "objective/rlhf_reward": -4.0404615759849545, + "objective/scores": 1.1, + "policy/approxkl_avg": 70.1800537109375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.527640700340271, + "step": 2251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9971266984939575 + }, + { + "episode": 36048, + "epoch": 0.6479490958766222, + "loss/policy_avg": 0.6092630624771118, + "lr": 2.5682515337423315e-06, + "objective/entropy": 156.32025146484375, + "objective/kl": 26.440006256103516, + "objective/non_score_reward": -2.644000768661499, + "objective/rlhf_reward": -8.176003074645998, + "objective/scores": 0.6, + "policy/approxkl_avg": 122.7042007446289, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6378190517425537, + "step": 2252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993071556091309 + }, + { + "episode": 36064, + "epoch": 0.6482366897940108, + "loss/policy_avg": 0.08317308127880096, + "lr": 2.5680598159509203e-06, + "objective/entropy": -141.36428833007812, + "objective/kl": 18.52130889892578, + "objective/non_score_reward": -1.8521307706832886, + "objective/rlhf_reward": -9.408522605895996, + "objective/scores": -0.5, + "policy/approxkl_avg": 117.25154113769531, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.835111141204834, + "step": 2253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9980623722076416 + }, + { + "episode": 36080, + "epoch": 0.6485242837113995, + "loss/policy_avg": 0.1629417985677719, + "lr": 2.5678680981595096e-06, + "objective/entropy": 82.80020141601562, + "objective/kl": 15.670761108398438, + "objective/non_score_reward": -1.5670759677886963, + "objective/rlhf_reward": -3.344585214496824, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 36.081214904785156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5541933178901672, + "step": 2254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9976167678833008 + }, + { + "episode": 36096, + "epoch": 0.6488118776287881, + "loss/policy_avg": 0.596114456653595, + "lr": 2.5676763803680984e-06, + "objective/entropy": 69.89568328857422, + "objective/kl": 22.839479446411133, + "objective/non_score_reward": -2.2839479446411133, + "objective/rlhf_reward": -11.135791778564453, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.495725631713867, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.43410223722457886, + "step": 2255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999086618423462 + }, + { + "episode": 36112, + "epoch": 0.6490994715461768, + "loss/policy_avg": 0.07463404536247253, + "lr": 2.567484662576687e-06, + "objective/entropy": 118.45555877685547, + "objective/kl": 14.987808227539062, + "objective/non_score_reward": -1.4987808465957642, + "objective/rlhf_reward": -7.995123386383057, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.267757415771484, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46388620138168335, + "step": 2256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005156993865967 + }, + { + "episode": 36128, + "epoch": 0.6493870654635654, + "loss/policy_avg": 0.23220334947109222, + "lr": 2.567292944785276e-06, + "objective/entropy": -73.17439270019531, + "objective/kl": 17.924890518188477, + "objective/non_score_reward": -1.7924890518188477, + "objective/rlhf_reward": -9.16995620727539, + "objective/scores": -0.5, + "policy/approxkl_avg": 41.652099609375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7985033988952637, + "step": 2257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974535703659058 + }, + { + "episode": 36144, + "epoch": 0.6496746593809541, + "loss/policy_avg": 0.2749658524990082, + "lr": 2.567101226993865e-06, + "objective/entropy": 87.86886596679688, + "objective/kl": 14.881671905517578, + "objective/non_score_reward": -1.4881671667099, + "objective/rlhf_reward": -1.5526686072349545, + "objective/scores": 1.1, + "policy/approxkl_avg": 5.160519599914551, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.545424222946167, + "step": 2258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000108242034912 + }, + { + "episode": 36160, + "epoch": 0.6499622532983428, + "loss/policy_avg": 0.3695850372314453, + "lr": 2.566909509202454e-06, + "objective/entropy": -106.71471405029297, + "objective/kl": 15.884528160095215, + "objective/non_score_reward": -1.588452935218811, + "objective/rlhf_reward": -5.953811800479889, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.107734680175781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.608582615852356, + "step": 2259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991270303726196 + }, + { + "episode": 36176, + "epoch": 0.6502498472157314, + "loss/policy_avg": 1.5744253396987915, + "lr": 2.566717791411043e-06, + "objective/entropy": -150.28012084960938, + "objective/kl": 12.480113983154297, + "objective/non_score_reward": -1.2480113506317139, + "objective/rlhf_reward": -6.992045879364014, + "objective/scores": -0.5, + "policy/approxkl_avg": 6.144730091094971, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.687921404838562, + "step": 2260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0005011558532715 + }, + { + "episode": 36192, + "epoch": 0.6505374411331201, + "loss/policy_avg": 0.46961259841918945, + "lr": 2.566526073619632e-06, + "objective/entropy": 355.0657958984375, + "objective/kl": 15.923636436462402, + "objective/non_score_reward": -1.5923638343811035, + "objective/rlhf_reward": -1.9694551587104794, + "objective/scores": 1.1, + "policy/approxkl_avg": 23.633182525634766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8576830625534058, + "step": 2261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997279167175293 + }, + { + "episode": 36208, + "epoch": 0.6508250350505087, + "loss/policy_avg": -0.017100073397159576, + "lr": 2.566334355828221e-06, + "objective/entropy": -42.104312896728516, + "objective/kl": 19.936870574951172, + "objective/non_score_reward": -1.9936869144439697, + "objective/rlhf_reward": -5.051028941513273, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 43.8869514465332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.541776180267334, + "step": 2262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991776943206787 + }, + { + "episode": 36224, + "epoch": 0.6511126289678973, + "loss/policy_avg": 0.16236397624015808, + "lr": 2.5661426380368097e-06, + "objective/entropy": 76.25511169433594, + "objective/kl": 19.869869232177734, + "objective/non_score_reward": -1.986986756324768, + "objective/rlhf_reward": -9.947946548461914, + "objective/scores": -0.5, + "policy/approxkl_avg": 141.39076232910156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6694316864013672, + "step": 2263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994683265686035 + }, + { + "episode": 36240, + "epoch": 0.651400222885286, + "loss/policy_avg": 0.31850236654281616, + "lr": 2.565950920245399e-06, + "objective/entropy": 102.84500122070312, + "objective/kl": 16.424327850341797, + "objective/non_score_reward": -1.642432451248169, + "objective/rlhf_reward": -8.569729804992676, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.593269348144531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7008790373802185, + "step": 2264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999637246131897 + }, + { + "episode": 36256, + "epoch": 0.6516878168026746, + "loss/policy_avg": 0.37176790833473206, + "lr": 2.5657592024539877e-06, + "objective/entropy": -42.34572982788086, + "objective/kl": 16.17561912536621, + "objective/non_score_reward": -1.6175618171691895, + "objective/rlhf_reward": -4.3475415430226665, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 54.164154052734375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8059068918228149, + "step": 2265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9993457794189453 + }, + { + "episode": 36272, + "epoch": 0.6519754107200633, + "loss/policy_avg": 0.2269030511379242, + "lr": 2.5655674846625765e-06, + "objective/entropy": 156.19015502929688, + "objective/kl": 20.02953338623047, + "objective/non_score_reward": -2.002953052520752, + "objective/rlhf_reward": -6.064401517586644, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 3.891697406768799, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.38503801822662354, + "step": 2266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019311904907227 + }, + { + "episode": 36288, + "epoch": 0.6522630046374519, + "loss/policy_avg": 0.3679116368293762, + "lr": 2.5653757668711658e-06, + "objective/entropy": 126.4683609008789, + "objective/kl": 15.896159172058105, + "objective/non_score_reward": -1.589616060256958, + "objective/rlhf_reward": -8.358464241027832, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.862239360809326, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7594268321990967, + "step": 2267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980642795562744 + }, + { + "episode": 36304, + "epoch": 0.6525505985548405, + "loss/policy_avg": 0.6537224650382996, + "lr": 2.5651840490797546e-06, + "objective/entropy": 216.36676025390625, + "objective/kl": 12.314484596252441, + "objective/non_score_reward": -1.2314484119415283, + "objective/rlhf_reward": -4.525793886184692, + "objective/scores": 0.1, + "policy/approxkl_avg": 10.041047096252441, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.9338732957839966, + "step": 2268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9994885921478271 + }, + { + "episode": 36320, + "epoch": 0.6528381924722292, + "loss/policy_avg": -0.007923688739538193, + "lr": 2.564992331288344e-06, + "objective/entropy": -149.3037872314453, + "objective/kl": 11.473003387451172, + "objective/non_score_reward": -1.1473004817962646, + "objective/rlhf_reward": -6.589201927185059, + "objective/scores": -0.5, + "policy/approxkl_avg": 0.23434622585773468, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4156072735786438, + "step": 2269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000920057296753 + }, + { + "episode": 36336, + "epoch": 0.6531257863896178, + "loss/policy_avg": 0.678230345249176, + "lr": 2.5648006134969326e-06, + "objective/entropy": 363.0588073730469, + "objective/kl": 17.33286476135254, + "objective/non_score_reward": -1.7332863807678223, + "objective/rlhf_reward": -8.933145523071289, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.384531021118164, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8610842227935791, + "step": 2270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001692771911621 + }, + { + "episode": 36352, + "epoch": 0.6534133803070065, + "loss/policy_avg": 0.08171165734529495, + "lr": 2.5646088957055214e-06, + "objective/entropy": -11.031600952148438, + "objective/kl": 17.83512306213379, + "objective/non_score_reward": -1.7835124731063843, + "objective/rlhf_reward": -2.7340499520301815, + "objective/scores": 1.1, + "policy/approxkl_avg": 70.58218383789062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6740350723266602, + "step": 2271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9959897994995117 + }, + { + "episode": 36368, + "epoch": 0.6537009742243951, + "loss/policy_avg": 0.17662328481674194, + "lr": 2.5644171779141107e-06, + "objective/entropy": 88.57758331298828, + "objective/kl": 9.481622695922852, + "objective/non_score_reward": -0.9481624364852905, + "objective/rlhf_reward": -3.3926497310400006, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.293864727020264, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5531899929046631, + "step": 2272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002083778381348 + }, + { + "episode": 36384, + "epoch": 0.6539885681417839, + "loss/policy_avg": -0.1894199550151825, + "lr": 2.5642254601226995e-06, + "objective/entropy": -25.101699829101562, + "objective/kl": 11.592086791992188, + "objective/non_score_reward": -1.1592087745666504, + "objective/rlhf_reward": -2.5141291043916087, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.358327865600586, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.5184176564216614, + "step": 2273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.001044750213623 + }, + { + "episode": 36400, + "epoch": 0.6542761620591725, + "loss/policy_avg": -0.09053383767604828, + "lr": 2.5640337423312887e-06, + "objective/entropy": 98.01313781738281, + "objective/kl": 8.616978645324707, + "objective/non_score_reward": -0.8616980314254761, + "objective/rlhf_reward": -3.0467919468879696, + "objective/scores": 0.1, + "policy/approxkl_avg": 18.988311767578125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5338178277015686, + "step": 2274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000063896179199 + }, + { + "episode": 36416, + "epoch": 0.6545637559765611, + "loss/policy_avg": 0.43914473056793213, + "lr": 2.5638420245398775e-06, + "objective/entropy": 2.6595993041992188, + "objective/kl": 19.53274154663086, + "objective/non_score_reward": -1.9532740116119385, + "objective/rlhf_reward": -9.813096046447754, + "objective/scores": -0.5, + "policy/approxkl_avg": 46.37147521972656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5686173439025879, + "step": 2275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9987603425979614 + }, + { + "episode": 36432, + "epoch": 0.6548513498939498, + "loss/policy_avg": 0.0820363238453865, + "lr": 2.5636503067484663e-06, + "objective/entropy": 145.08493041992188, + "objective/kl": 14.222003936767578, + "objective/non_score_reward": -1.422200322151184, + "objective/rlhf_reward": -3.9554681936899816, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 99.89260864257812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6236465573310852, + "step": 2276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998683214187622 + }, + { + "episode": 36448, + "epoch": 0.6551389438113384, + "loss/policy_avg": 0.00839349627494812, + "lr": 2.5634585889570556e-06, + "objective/entropy": 215.92877197265625, + "objective/kl": 21.089658737182617, + "objective/non_score_reward": -2.1089658737182617, + "objective/rlhf_reward": -10.435863494873047, + "objective/scores": -0.5, + "policy/approxkl_avg": 24.066848754882812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8423826098442078, + "step": 2277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013651847839355 + }, + { + "episode": 36464, + "epoch": 0.655426537728727, + "loss/policy_avg": 0.675013542175293, + "lr": 2.5632668711656444e-06, + "objective/entropy": -108.94208526611328, + "objective/kl": 19.815250396728516, + "objective/non_score_reward": -1.9815250635147095, + "objective/rlhf_reward": -9.92609977722168, + "objective/scores": -0.5, + "policy/approxkl_avg": 67.00296020507812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8092669248580933, + "step": 2278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9995282888412476 + }, + { + "episode": 36480, + "epoch": 0.6557141316461157, + "loss/policy_avg": 1.2140064239501953, + "lr": 2.563075153374233e-06, + "objective/entropy": -95.05614471435547, + "objective/kl": 16.91280174255371, + "objective/non_score_reward": -1.6912803649902344, + "objective/rlhf_reward": -6.365121459960937, + "objective/scores": 0.1, + "policy/approxkl_avg": 20.181533813476562, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7570371627807617, + "step": 2279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980716705322266 + }, + { + "episode": 36496, + "epoch": 0.6560017255635043, + "loss/policy_avg": 0.0030971169471740723, + "lr": 2.562883435582822e-06, + "objective/entropy": -145.6276397705078, + "objective/kl": 6.605377197265625, + "objective/non_score_reward": -0.6605377197265625, + "objective/rlhf_reward": -2.24215075969696, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.3553369045257568, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5911043882369995, + "step": 2280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0003302097320557 + }, + { + "episode": 36512, + "epoch": 0.656289319480893, + "loss/policy_avg": 0.3546699583530426, + "lr": 2.562691717791411e-06, + "objective/entropy": 233.58958435058594, + "objective/kl": 17.121244430541992, + "objective/non_score_reward": -1.7121243476867676, + "objective/rlhf_reward": -3.9247784956705303, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 12.144831657409668, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8071939945220947, + "step": 2281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0023207664489746 + }, + { + "episode": 36528, + "epoch": 0.6565769133982816, + "loss/policy_avg": 1.2533683776855469, + "lr": 2.5625e-06, + "objective/entropy": 40.26350402832031, + "objective/kl": 11.750067710876465, + "objective/non_score_reward": -1.1750068664550781, + "objective/rlhf_reward": -2.577320875898872, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 34.881553649902344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6683433055877686, + "step": 2282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9997665882110596 + }, + { + "episode": 36544, + "epoch": 0.6568645073156703, + "loss/policy_avg": -0.07329759001731873, + "lr": 2.562308282208589e-06, + "objective/entropy": 74.04084777832031, + "objective/kl": 14.00217342376709, + "objective/non_score_reward": -1.4002174139022827, + "objective/rlhf_reward": -7.600869655609131, + "objective/scores": -0.5, + "policy/approxkl_avg": 15.320232391357422, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.760829508304596, + "step": 2283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000995635986328 + }, + { + "episode": 36560, + "epoch": 0.6571521012330589, + "loss/policy_avg": 0.40319544076919556, + "lr": 2.562116564417178e-06, + "objective/entropy": 132.4573974609375, + "objective/kl": 17.98862075805664, + "objective/non_score_reward": -1.7988622188568115, + "objective/rlhf_reward": -4.795448756217956, + "objective/scores": 0.6, + "policy/approxkl_avg": 17.402114868164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6830170154571533, + "step": 2284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9980769157409668 + }, + { + "episode": 36576, + "epoch": 0.6574396951504475, + "loss/policy_avg": 0.4163977801799774, + "lr": 2.561924846625767e-06, + "objective/entropy": 52.17835998535156, + "objective/kl": 14.525779724121094, + "objective/non_score_reward": -1.452578067779541, + "objective/rlhf_reward": -1.410312300920486, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.813898086547852, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.811333417892456, + "step": 2285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997317790985107 + }, + { + "episode": 36592, + "epoch": 0.6577272890678362, + "loss/policy_avg": 0.23318399488925934, + "lr": 2.5617331288343557e-06, + "objective/entropy": 176.15615844726562, + "objective/kl": 17.697359085083008, + "objective/non_score_reward": -1.7697358131408691, + "objective/rlhf_reward": -9.078943252563477, + "objective/scores": -0.5, + "policy/approxkl_avg": 78.4808349609375, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7467453479766846, + "step": 2286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9992974996566772 + }, + { + "episode": 36608, + "epoch": 0.6580148829852248, + "loss/policy_avg": 0.1549290418624878, + "lr": 2.561541411042945e-06, + "objective/entropy": 181.3204345703125, + "objective/kl": 21.084064483642578, + "objective/non_score_reward": -2.1084065437316895, + "objective/rlhf_reward": -4.033626532554626, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.693029403686523, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.790191650390625, + "step": 2287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995472431182861 + }, + { + "episode": 36624, + "epoch": 0.6583024769026136, + "loss/policy_avg": -0.020601997151970863, + "lr": 2.5613496932515337e-06, + "objective/entropy": 260.5650634765625, + "objective/kl": 17.291284561157227, + "objective/non_score_reward": -1.729128360748291, + "objective/rlhf_reward": -8.916513442993164, + "objective/scores": -0.5, + "policy/approxkl_avg": 32.037269592285156, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6160063743591309, + "step": 2288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999596357345581 + }, + { + "episode": 36640, + "epoch": 0.6585900708200022, + "loss/policy_avg": 0.19600838422775269, + "lr": 2.561157975460123e-06, + "objective/entropy": -18.00920867919922, + "objective/kl": 9.472246170043945, + "objective/non_score_reward": -0.9472246170043945, + "objective/rlhf_reward": -5.788898468017578, + "objective/scores": -0.5, + "policy/approxkl_avg": 53.95936584472656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6834456920623779, + "step": 2289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998971700668335 + }, + { + "episode": 36656, + "epoch": 0.6588776647373908, + "loss/policy_avg": 0.576319694519043, + "lr": 2.5609662576687118e-06, + "objective/entropy": -37.32265090942383, + "objective/kl": 12.981508255004883, + "objective/non_score_reward": -1.2981507778167725, + "objective/rlhf_reward": -4.7926029920578, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.780340671539307, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5498796701431274, + "step": 2290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9995492696762085 + }, + { + "episode": 36672, + "epoch": 0.6591652586547795, + "loss/policy_avg": 0.64011150598526, + "lr": 2.5607745398773006e-06, + "objective/entropy": 123.31390380859375, + "objective/kl": 21.683334350585938, + "objective/non_score_reward": -2.1683335304260254, + "objective/rlhf_reward": -10.673334121704102, + "objective/scores": -0.5, + "policy/approxkl_avg": 97.21592712402344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6466020345687866, + "step": 2291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990426301956177 + }, + { + "episode": 36688, + "epoch": 0.6594528525721681, + "loss/policy_avg": 0.804427981376648, + "lr": 2.56058282208589e-06, + "objective/entropy": -145.3238525390625, + "objective/kl": 16.019611358642578, + "objective/non_score_reward": -1.6019612550735474, + "objective/rlhf_reward": -2.0078449010848995, + "objective/scores": 1.1, + "policy/approxkl_avg": 84.41429138183594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4610861539840698, + "step": 2292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991902112960815 + }, + { + "episode": 36704, + "epoch": 0.6597404464895568, + "loss/policy_avg": 0.5242794752120972, + "lr": 2.5603911042944786e-06, + "objective/entropy": -87.87837219238281, + "objective/kl": 14.958013534545898, + "objective/non_score_reward": -1.4958014488220215, + "objective/rlhf_reward": -7.983206272125244, + "objective/scores": -0.5, + "policy/approxkl_avg": 20.005577087402344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6943378448486328, + "step": 2293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9993853569030762 + }, + { + "episode": 36720, + "epoch": 0.6600280404069454, + "loss/policy_avg": 0.19961407780647278, + "lr": 2.5601993865030674e-06, + "objective/entropy": -21.319744110107422, + "objective/kl": 11.861923217773438, + "objective/non_score_reward": -1.186192274093628, + "objective/rlhf_reward": -1.8210502012979715, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 1.3134453296661377, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8009828329086304, + "step": 2294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999687671661377 + }, + { + "episode": 36736, + "epoch": 0.660315634324334, + "loss/policy_avg": 0.148433655500412, + "lr": 2.5600076687116567e-06, + "objective/entropy": 152.937255859375, + "objective/kl": 17.217933654785156, + "objective/non_score_reward": -1.7217931747436523, + "objective/rlhf_reward": -5.330913155284479, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 76.15084838867188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7016232013702393, + "step": 2295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986050128936768 + }, + { + "episode": 36752, + "epoch": 0.6606032282417227, + "loss/policy_avg": 0.6339513659477234, + "lr": 2.5598159509202455e-06, + "objective/entropy": 89.31312561035156, + "objective/kl": 18.98331642150879, + "objective/non_score_reward": -1.898331642150879, + "objective/rlhf_reward": -7.19332624077797, + "objective/scores": 0.1, + "policy/approxkl_avg": 37.01063919067383, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5004596710205078, + "step": 2296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980783462524414 + }, + { + "episode": 36768, + "epoch": 0.6608908221591113, + "loss/policy_avg": 0.32428914308547974, + "lr": 2.5596242331288347e-06, + "objective/entropy": 208.98477172851562, + "objective/kl": 23.732393264770508, + "objective/non_score_reward": -2.373239517211914, + "objective/rlhf_reward": -11.492958068847656, + "objective/scores": -0.5, + "policy/approxkl_avg": 90.21807098388672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7058777213096619, + "step": 2297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9969258308410645 + }, + { + "episode": 36784, + "epoch": 0.6611784160765, + "loss/policy_avg": 0.426738977432251, + "lr": 2.5594325153374235e-06, + "objective/entropy": 59.808349609375, + "objective/kl": 18.861650466918945, + "objective/non_score_reward": -1.886164903640747, + "objective/rlhf_reward": -4.620940540672514, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 51.7357177734375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7321165204048157, + "step": 2298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9962069988250732 + }, + { + "episode": 36800, + "epoch": 0.6614660099938886, + "loss/policy_avg": -0.3187381327152252, + "lr": 2.5592407975460123e-06, + "objective/entropy": 113.80389404296875, + "objective/kl": 17.378707885742188, + "objective/non_score_reward": -1.737870693206787, + "objective/rlhf_reward": -4.551482772827148, + "objective/scores": 0.6, + "policy/approxkl_avg": 18.090682983398438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6032657623291016, + "step": 2299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001037120819092 + }, + { + "episode": 36816, + "epoch": 0.6617536039112772, + "loss/policy_avg": 0.1893741935491562, + "lr": 2.5590490797546016e-06, + "objective/entropy": 88.16151428222656, + "objective/kl": 18.612485885620117, + "objective/non_score_reward": -1.86124849319458, + "objective/rlhf_reward": -9.44499397277832, + "objective/scores": -0.5, + "policy/approxkl_avg": 74.23924255371094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6313012838363647, + "step": 2300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00156831741333 + }, + { + "episode": 36832, + "epoch": 0.6620411978286659, + "loss/policy_avg": -0.07124347239732742, + "lr": 2.55885736196319e-06, + "objective/entropy": 3.6212921142578125, + "objective/kl": 16.59735107421875, + "objective/non_score_reward": -1.6597352027893066, + "objective/rlhf_reward": -8.638940811157227, + "objective/scores": -0.5, + "policy/approxkl_avg": 2.6608316898345947, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.586020827293396, + "step": 2301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000122547149658 + }, + { + "episode": 36848, + "epoch": 0.6623287917460545, + "loss/policy_avg": -0.32903608679771423, + "lr": 2.558665644171779e-06, + "objective/entropy": 4.168998718261719, + "objective/kl": 16.042404174804688, + "objective/non_score_reward": -1.6042404174804688, + "objective/rlhf_reward": -4.016961371898651, + "objective/scores": 0.6, + "policy/approxkl_avg": 20.106163024902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6971254944801331, + "step": 2302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00260066986084 + }, + { + "episode": 36864, + "epoch": 0.6626163856634432, + "loss/policy_avg": 0.420703649520874, + "lr": 2.558473926380368e-06, + "objective/entropy": 203.9630889892578, + "objective/kl": 16.670202255249023, + "objective/non_score_reward": -1.6670202016830444, + "objective/rlhf_reward": -6.2680810451507565, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.04973602294922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.749180793762207, + "step": 2303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9969289302825928 + }, + { + "episode": 36880, + "epoch": 0.6629039795808319, + "loss/policy_avg": 0.05179433524608612, + "lr": 2.558282208588957e-06, + "objective/entropy": 0.6831283569335938, + "objective/kl": 18.128053665161133, + "objective/non_score_reward": -1.812805414199829, + "objective/rlhf_reward": -5.303810517268117, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 13.856398582458496, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 1.0042017698287964, + "step": 2304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989118576049805 + }, + { + "episode": 36896, + "epoch": 0.6631915734982206, + "loss/policy_avg": 0.2960914075374603, + "lr": 2.558090490797546e-06, + "objective/entropy": 83.56949615478516, + "objective/kl": 17.618938446044922, + "objective/non_score_reward": -1.7618937492370605, + "objective/rlhf_reward": -9.047574996948242, + "objective/scores": -0.5, + "policy/approxkl_avg": 8.121335983276367, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5466283559799194, + "step": 2305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000248908996582 + }, + { + "episode": 36912, + "epoch": 0.6634791674156092, + "loss/policy_avg": 0.17726615071296692, + "lr": 2.557898773006135e-06, + "objective/entropy": 62.649749755859375, + "objective/kl": 14.863105773925781, + "objective/non_score_reward": -1.4863104820251465, + "objective/rlhf_reward": -1.5452419877052304, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.797748565673828, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7958052158355713, + "step": 2306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999192237854004 + }, + { + "episode": 36928, + "epoch": 0.6637667613329978, + "loss/policy_avg": 0.27799302339553833, + "lr": 2.557707055214724e-06, + "objective/entropy": -89.15963745117188, + "objective/kl": 12.794493675231934, + "objective/non_score_reward": -1.279449462890625, + "objective/rlhf_reward": -2.194078732968542, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.691387176513672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5591141581535339, + "step": 2307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9968161582946777 + }, + { + "episode": 36944, + "epoch": 0.6640543552503865, + "loss/policy_avg": 0.22466981410980225, + "lr": 2.557515337423313e-06, + "objective/entropy": 128.76828002929688, + "objective/kl": 16.3138484954834, + "objective/non_score_reward": -1.6313848495483398, + "objective/rlhf_reward": -2.125539308786392, + "objective/scores": 1.1, + "policy/approxkl_avg": 32.673011779785156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7959351539611816, + "step": 2308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9986681938171387 + }, + { + "episode": 36960, + "epoch": 0.6643419491677751, + "loss/policy_avg": 0.15630009770393372, + "lr": 2.5573236196319017e-06, + "objective/entropy": -250.48313903808594, + "objective/kl": 13.846506118774414, + "objective/non_score_reward": -1.384650707244873, + "objective/rlhf_reward": -2.6148838742983074, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 10.164833068847656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5223649740219116, + "step": 2309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981579780578613 + }, + { + "episode": 36976, + "epoch": 0.6646295430851638, + "loss/policy_avg": 0.0861392617225647, + "lr": 2.557131901840491e-06, + "objective/entropy": 149.0970916748047, + "objective/kl": 15.741743087768555, + "objective/non_score_reward": -1.574174165725708, + "objective/rlhf_reward": -3.896696931123733, + "objective/scores": 0.6, + "policy/approxkl_avg": 103.422119140625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8788071870803833, + "step": 2310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999258518218994 + }, + { + "episode": 36992, + "epoch": 0.6649171370025524, + "loss/policy_avg": 0.13818402588367462, + "lr": 2.5569401840490797e-06, + "objective/entropy": 198.20147705078125, + "objective/kl": 22.796815872192383, + "objective/non_score_reward": -2.279681444168091, + "objective/rlhf_reward": -6.195007000805113, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 46.45262145996094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4907756447792053, + "step": 2311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9959955215454102 + }, + { + "episode": 37008, + "epoch": 0.665204730919941, + "loss/policy_avg": 0.10368118435144424, + "lr": 2.556748466257669e-06, + "objective/entropy": 5.179817199707031, + "objective/kl": 16.019678115844727, + "objective/non_score_reward": -1.6019678115844727, + "objective/rlhf_reward": -2.0078711867332455, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.488909363746643, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7089933156967163, + "step": 2312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0010156631469727 + }, + { + "episode": 37024, + "epoch": 0.6654923248373297, + "loss/policy_avg": 0.5133357644081116, + "lr": 2.5565567484662578e-06, + "objective/entropy": -228.21189880371094, + "objective/kl": 11.758785247802734, + "objective/non_score_reward": -1.1758785247802734, + "objective/rlhf_reward": -0.3035142779350277, + "objective/scores": 1.1, + "policy/approxkl_avg": 10.367254257202148, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8332253694534302, + "step": 2313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0001230239868164 + }, + { + "episode": 37040, + "epoch": 0.6657799187547183, + "loss/policy_avg": 1.2695897817611694, + "lr": 2.5563650306748466e-06, + "objective/entropy": 69.35960388183594, + "objective/kl": 14.953081130981445, + "objective/non_score_reward": -1.4953081607818604, + "objective/rlhf_reward": -7.9812331199646, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.81647491455078, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7310347557067871, + "step": 2314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9994488954544067 + }, + { + "episode": 37056, + "epoch": 0.666067512672107, + "loss/policy_avg": 0.3404615521430969, + "lr": 2.556173312883436e-06, + "objective/entropy": 55.503807067871094, + "objective/kl": 19.9896240234375, + "objective/non_score_reward": -1.998962163925171, + "objective/rlhf_reward": -6.391729030672627, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 13.813854217529297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6280677914619446, + "step": 2315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997502326965332 + }, + { + "episode": 37072, + "epoch": 0.6663551065894956, + "loss/policy_avg": 0.26698288321495056, + "lr": 2.5559815950920246e-06, + "objective/entropy": 66.94442749023438, + "objective/kl": 11.107053756713867, + "objective/non_score_reward": -1.1107053756713867, + "objective/rlhf_reward": -6.442821502685547, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.230381011962891, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6185259819030762, + "step": 2316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984166622161865 + }, + { + "episode": 37088, + "epoch": 0.6666427005068842, + "loss/policy_avg": 0.272176593542099, + "lr": 2.5557898773006134e-06, + "objective/entropy": -25.308387756347656, + "objective/kl": 13.86503791809082, + "objective/non_score_reward": -1.3865039348602295, + "objective/rlhf_reward": -5.146015635132789, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.04141235351562, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.708859920501709, + "step": 2317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0056681632995605 + }, + { + "episode": 37104, + "epoch": 0.6669302944242729, + "loss/policy_avg": 0.4910784959793091, + "lr": 2.5555981595092027e-06, + "objective/entropy": 175.00706481933594, + "objective/kl": 13.743255615234375, + "objective/non_score_reward": -1.3743257522583008, + "objective/rlhf_reward": -5.097303009033203, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.28903579711914, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4330393671989441, + "step": 2318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001161098480225 + }, + { + "episode": 37120, + "epoch": 0.6672178883416616, + "loss/policy_avg": 0.3694079518318176, + "lr": 2.5554064417177915e-06, + "objective/entropy": 148.99612426757812, + "objective/kl": 19.65071678161621, + "objective/non_score_reward": -1.965071678161621, + "objective/rlhf_reward": -9.860286712646484, + "objective/scores": -0.5, + "policy/approxkl_avg": 144.99139404296875, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5872672200202942, + "step": 2319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999612808227539 + }, + { + "episode": 37136, + "epoch": 0.6675054822590503, + "loss/policy_avg": 0.3520659804344177, + "lr": 2.5552147239263807e-06, + "objective/entropy": 141.15826416015625, + "objective/kl": 17.581241607666016, + "objective/non_score_reward": -1.7581241130828857, + "objective/rlhf_reward": -5.085085282998021, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 29.694534301757812, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3657630980014801, + "step": 2320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991331100463867 + }, + { + "episode": 37152, + "epoch": 0.6677930761764389, + "loss/policy_avg": 0.11424215137958527, + "lr": 2.5550230061349695e-06, + "objective/entropy": 117.16325378417969, + "objective/kl": 15.977231979370117, + "objective/non_score_reward": -1.5977232456207275, + "objective/rlhf_reward": -8.39089298248291, + "objective/scores": -0.5, + "policy/approxkl_avg": 38.100990295410156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.598469614982605, + "step": 2321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9974943399429321 + }, + { + "episode": 37168, + "epoch": 0.6680806700938275, + "loss/policy_avg": 0.06039281189441681, + "lr": 2.5548312883435583e-06, + "objective/entropy": 173.42393493652344, + "objective/kl": 18.59466552734375, + "objective/non_score_reward": -1.859466552734375, + "objective/rlhf_reward": -3.0378662109374996, + "objective/scores": 1.1, + "policy/approxkl_avg": 83.97549438476562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7640442848205566, + "step": 2322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.995422601699829 + }, + { + "episode": 37184, + "epoch": 0.6683682640112162, + "loss/policy_avg": 0.4325703978538513, + "lr": 2.554639570552147e-06, + "objective/entropy": 101.97483825683594, + "objective/kl": 18.731670379638672, + "objective/non_score_reward": -1.8731671571731567, + "objective/rlhf_reward": -4.568949614406797, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 98.36922454833984, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6910192370414734, + "step": 2323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999758005142212 + }, + { + "episode": 37200, + "epoch": 0.6686558579286048, + "loss/policy_avg": 0.013559557497501373, + "lr": 2.554447852760736e-06, + "objective/entropy": 123.21492767333984, + "objective/kl": 15.45811653137207, + "objective/non_score_reward": -1.5458115339279175, + "objective/rlhf_reward": -5.783246254920959, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.97567367553711, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6617206335067749, + "step": 2324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0010128021240234 + }, + { + "episode": 37216, + "epoch": 0.6689434518459935, + "loss/policy_avg": 0.011508099734783173, + "lr": 2.554256134969325e-06, + "objective/entropy": 256.7122802734375, + "objective/kl": 17.93226432800293, + "objective/non_score_reward": -1.7932264804840088, + "objective/rlhf_reward": -6.772906160354614, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.55018424987793, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8229022026062012, + "step": 2325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000396251678467 + }, + { + "episode": 37232, + "epoch": 0.6692310457633821, + "loss/policy_avg": 0.22444044053554535, + "lr": 2.554064417177914e-06, + "objective/entropy": 158.66749572753906, + "objective/kl": 13.320199966430664, + "objective/non_score_reward": -1.3320200443267822, + "objective/rlhf_reward": -7.328080177307129, + "objective/scores": -0.5, + "policy/approxkl_avg": 49.490081787109375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6645934581756592, + "step": 2326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004334449768066 + }, + { + "episode": 37248, + "epoch": 0.6695186396807707, + "loss/policy_avg": 0.24487340450286865, + "lr": 2.5538726993865032e-06, + "objective/entropy": 148.02096557617188, + "objective/kl": 23.824968338012695, + "objective/non_score_reward": -2.3824968338012695, + "objective/rlhf_reward": -11.529987335205078, + "objective/scores": -0.5, + "policy/approxkl_avg": 41.53895950317383, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6445210576057434, + "step": 2327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9995183944702148 + }, + { + "episode": 37264, + "epoch": 0.6698062335981594, + "loss/policy_avg": 0.021379921585321426, + "lr": 2.553680981595092e-06, + "objective/entropy": -103.62948608398438, + "objective/kl": 13.832340240478516, + "objective/non_score_reward": -1.3832340240478516, + "objective/rlhf_reward": -3.5855251056718185, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.3906583786010742, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6477087140083313, + "step": 2328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0015311241149902 + }, + { + "episode": 37280, + "epoch": 0.670093827515548, + "loss/policy_avg": 0.49968069791793823, + "lr": 2.553489263803681e-06, + "objective/entropy": 59.137794494628906, + "objective/kl": 18.70618438720703, + "objective/non_score_reward": -1.8706185817718506, + "objective/rlhf_reward": -7.082474192976951, + "objective/scores": 0.1, + "policy/approxkl_avg": 209.8355255126953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.597786545753479, + "step": 2329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9967823028564453 + }, + { + "episode": 37296, + "epoch": 0.6703814214329367, + "loss/policy_avg": 0.4317970275878906, + "lr": 2.55329754601227e-06, + "objective/entropy": 15.281234741210938, + "objective/kl": 8.350692749023438, + "objective/non_score_reward": -0.8350692391395569, + "objective/rlhf_reward": -5.340276718139648, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.626282691955566, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7200409173965454, + "step": 2330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997509479522705 + }, + { + "episode": 37312, + "epoch": 0.6706690153503253, + "loss/policy_avg": 2.48857045173645, + "lr": 2.553105828220859e-06, + "objective/entropy": 10.84235954284668, + "objective/kl": 15.428094863891602, + "objective/non_score_reward": -1.5428093671798706, + "objective/rlhf_reward": -1.7712374091148373, + "objective/scores": 1.1, + "policy/approxkl_avg": 38.43216323852539, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7159597873687744, + "step": 2331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0019233226776123 + }, + { + "episode": 37328, + "epoch": 0.6709566092677139, + "loss/policy_avg": 0.27047663927078247, + "lr": 2.5529141104294477e-06, + "objective/entropy": 1.0194292068481445, + "objective/kl": 19.12179946899414, + "objective/non_score_reward": -1.9121801853179932, + "objective/rlhf_reward": -5.70130969114774, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 67.82426452636719, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4647403359413147, + "step": 2332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991931915283203 + }, + { + "episode": 37344, + "epoch": 0.6712442031851026, + "loss/policy_avg": 0.18591710925102234, + "lr": 2.552722392638037e-06, + "objective/entropy": -36.61052703857422, + "objective/kl": 18.88507843017578, + "objective/non_score_reward": -1.8885078430175781, + "objective/rlhf_reward": -7.154031640291214, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.58317565917969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5592823624610901, + "step": 2333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9964879751205444 + }, + { + "episode": 37360, + "epoch": 0.6715317971024913, + "loss/policy_avg": 0.05644374340772629, + "lr": 2.5525306748466257e-06, + "objective/entropy": 63.062744140625, + "objective/kl": 17.671581268310547, + "objective/non_score_reward": -1.7671581506729126, + "objective/rlhf_reward": -6.668632781505584, + "objective/scores": 0.1, + "policy/approxkl_avg": 38.121299743652344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8344275951385498, + "step": 2334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9961326122283936 + }, + { + "episode": 37376, + "epoch": 0.67181939101988, + "loss/policy_avg": 0.17964068055152893, + "lr": 2.552338957055215e-06, + "objective/entropy": 50.110347747802734, + "objective/kl": 12.703283309936523, + "objective/non_score_reward": -1.270328164100647, + "objective/rlhf_reward": -2.681312835216522, + "objective/scores": 0.6, + "policy/approxkl_avg": 9.324361801147461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6545754671096802, + "step": 2335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004725456237793 + }, + { + "episode": 37392, + "epoch": 0.6721069849372686, + "loss/policy_avg": 0.18911662697792053, + "lr": 2.552147239263804e-06, + "objective/entropy": 197.40760803222656, + "objective/kl": 16.612634658813477, + "objective/non_score_reward": -1.6612634658813477, + "objective/rlhf_reward": -4.911720425883928, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 58.14643096923828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.634497880935669, + "step": 2336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985204935073853 + }, + { + "episode": 37408, + "epoch": 0.6723945788546573, + "loss/policy_avg": 0.4584101736545563, + "lr": 2.5519555214723926e-06, + "objective/entropy": 149.51400756835938, + "objective/kl": 16.61104965209961, + "objective/non_score_reward": -1.6611049175262451, + "objective/rlhf_reward": -5.040299389425831, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 62.66534423828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7145014405250549, + "step": 2337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983527660369873 + }, + { + "episode": 37424, + "epoch": 0.6726821727720459, + "loss/policy_avg": 0.6171430349349976, + "lr": 2.551763803680982e-06, + "objective/entropy": -9.677066802978516, + "objective/kl": 13.620866775512695, + "objective/non_score_reward": -1.3620866537094116, + "objective/rlhf_reward": -3.048346465826034, + "objective/scores": 0.6, + "policy/approxkl_avg": 44.04124450683594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6463913917541504, + "step": 2338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975409507751465 + }, + { + "episode": 37440, + "epoch": 0.6729697666894345, + "loss/policy_avg": 0.1318362057209015, + "lr": 2.5515720858895706e-06, + "objective/entropy": 38.19598388671875, + "objective/kl": 19.960039138793945, + "objective/non_score_reward": -1.9960038661956787, + "objective/rlhf_reward": -7.5840154647827145, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.450936794281006, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5201414823532104, + "step": 2339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000936269760132 + }, + { + "episode": 37456, + "epoch": 0.6732573606068232, + "loss/policy_avg": -0.4302183985710144, + "lr": 2.55138036809816e-06, + "objective/entropy": -12.776840209960938, + "objective/kl": 11.0087890625, + "objective/non_score_reward": -1.1008789539337158, + "objective/rlhf_reward": -6.403515815734863, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.597243309020996, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.7018436789512634, + "step": 2340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0039219856262207 + }, + { + "episode": 37472, + "epoch": 0.6735449545242118, + "loss/policy_avg": 0.1691046804189682, + "lr": 2.5511886503067487e-06, + "objective/entropy": 159.7733917236328, + "objective/kl": 11.087263107299805, + "objective/non_score_reward": -1.1087265014648438, + "objective/rlhf_reward": -6.434906005859375, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.17793083190918, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43710124492645264, + "step": 2341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960179328918457 + }, + { + "episode": 37488, + "epoch": 0.6738325484416005, + "loss/policy_avg": 0.35037779808044434, + "lr": 2.5509969325153375e-06, + "objective/entropy": 56.622718811035156, + "objective/kl": 17.291534423828125, + "objective/non_score_reward": -1.7291532754898071, + "objective/rlhf_reward": -2.5166131317615505, + "objective/scores": 1.1, + "policy/approxkl_avg": 29.37803077697754, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3321661353111267, + "step": 2342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989839792251587 + }, + { + "episode": 37504, + "epoch": 0.6741201423589891, + "loss/policy_avg": -0.08184777200222015, + "lr": 2.5508052147239267e-06, + "objective/entropy": -168.5766143798828, + "objective/kl": 16.240413665771484, + "objective/non_score_reward": -1.6240415573120117, + "objective/rlhf_reward": -6.096166288852691, + "objective/scores": 0.1, + "policy/approxkl_avg": 54.85392761230469, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7052419781684875, + "step": 2343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971129894256592 + }, + { + "episode": 37520, + "epoch": 0.6744077362763777, + "loss/policy_avg": -0.20576009154319763, + "lr": 2.5506134969325155e-06, + "objective/entropy": -11.535301208496094, + "objective/kl": 15.220268249511719, + "objective/non_score_reward": -1.522026777267456, + "objective/rlhf_reward": -1.6881071090698239, + "objective/scores": 1.1, + "policy/approxkl_avg": 15.641721725463867, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.47453761100769043, + "step": 2344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9983747005462646 + }, + { + "episode": 37536, + "epoch": 0.6746953301937664, + "loss/policy_avg": 0.7535731792449951, + "lr": 2.5504217791411043e-06, + "objective/entropy": 97.59146118164062, + "objective/kl": 12.347482681274414, + "objective/non_score_reward": -1.2347482442855835, + "objective/rlhf_reward": -2.8162866256394725, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 5.028593063354492, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5101989507675171, + "step": 2345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0025951862335205 + }, + { + "episode": 37552, + "epoch": 0.674982924111155, + "loss/policy_avg": 0.07940283417701721, + "lr": 2.550230061349693e-06, + "objective/entropy": 21.05797576904297, + "objective/kl": 9.749893188476562, + "objective/non_score_reward": -0.9749892354011536, + "objective/rlhf_reward": -3.4999567627906796, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.4519693851470947, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6360624432563782, + "step": 2346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000640869140625 + }, + { + "episode": 37568, + "epoch": 0.6752705180285437, + "loss/policy_avg": 0.5512768030166626, + "lr": 2.550038343558282e-06, + "objective/entropy": 23.716659545898438, + "objective/kl": 17.07099151611328, + "objective/non_score_reward": -1.707099199295044, + "objective/rlhf_reward": -8.82839584350586, + "objective/scores": -0.5, + "policy/approxkl_avg": 37.055694580078125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5303549766540527, + "step": 2347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996858835220337 + }, + { + "episode": 37584, + "epoch": 0.6755581119459323, + "loss/policy_avg": 1.214499831199646, + "lr": 2.549846625766871e-06, + "objective/entropy": -126.47698974609375, + "objective/kl": 13.222586631774902, + "objective/non_score_reward": -1.322258710861206, + "objective/rlhf_reward": -0.8890346050262448, + "objective/scores": 1.1, + "policy/approxkl_avg": 3.0465681552886963, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.7465829849243164, + "step": 2348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9995687007904053 + }, + { + "episode": 37600, + "epoch": 0.675845705863321, + "loss/policy_avg": 0.1287229359149933, + "lr": 2.54965490797546e-06, + "objective/entropy": -96.94557189941406, + "objective/kl": 17.62995147705078, + "objective/non_score_reward": -1.7629950046539307, + "objective/rlhf_reward": -6.651980257034301, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.454124450683594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8209439516067505, + "step": 2349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9989672899246216 + }, + { + "episode": 37616, + "epoch": 0.6761332997807097, + "loss/policy_avg": -0.0770440399646759, + "lr": 2.5494631901840492e-06, + "objective/entropy": -243.44192504882812, + "objective/kl": 14.587154388427734, + "objective/non_score_reward": -1.458715558052063, + "objective/rlhf_reward": -3.88745106287473, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 8.927319526672363, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6826550960540771, + "step": 2350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.998389720916748 + }, + { + "episode": 37632, + "epoch": 0.6764208936980983, + "loss/policy_avg": 0.21553274989128113, + "lr": 2.549271472392638e-06, + "objective/entropy": 9.667755126953125, + "objective/kl": 17.31271743774414, + "objective/non_score_reward": -1.731271743774414, + "objective/rlhf_reward": -4.802380504385505, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 57.263301849365234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6656813621520996, + "step": 2351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.000150203704834 + }, + { + "episode": 37648, + "epoch": 0.676708487615487, + "loss/policy_avg": -0.047590918838977814, + "lr": 2.549079754601227e-06, + "objective/entropy": 90.6687240600586, + "objective/kl": 14.7943696975708, + "objective/non_score_reward": -1.479436993598938, + "objective/rlhf_reward": -5.517748034000396, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.7691826820373535, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6057709455490112, + "step": 2352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000667572021484 + }, + { + "episode": 37664, + "epoch": 0.6769960815328756, + "loss/policy_avg": 0.8578144311904907, + "lr": 2.548888036809816e-06, + "objective/entropy": 70.19683837890625, + "objective/kl": 18.876052856445312, + "objective/non_score_reward": -1.8876051902770996, + "objective/rlhf_reward": -3.150420686602592, + "objective/scores": 1.1, + "policy/approxkl_avg": 41.112037658691406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5748271942138672, + "step": 2353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000706195831299 + }, + { + "episode": 37680, + "epoch": 0.6772836754502642, + "loss/policy_avg": 7.543760776519775, + "lr": 2.548696319018405e-06, + "objective/entropy": 126.7467041015625, + "objective/kl": 10.020503997802734, + "objective/non_score_reward": -1.0020503997802734, + "objective/rlhf_reward": -3.6082015097141262, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.69321608543396, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5338419675827026, + "step": 2354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.015646457672119 + }, + { + "episode": 37696, + "epoch": 0.6775712693676529, + "loss/policy_avg": 0.4916594624519348, + "lr": 2.5485046012269937e-06, + "objective/entropy": 15.650882720947266, + "objective/kl": 16.535350799560547, + "objective/non_score_reward": -1.653535008430481, + "objective/rlhf_reward": -8.614140510559082, + "objective/scores": -0.5, + "policy/approxkl_avg": 42.67940902709961, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46649202704429626, + "step": 2355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998720407485962 + }, + { + "episode": 37712, + "epoch": 0.6778588632850415, + "loss/policy_avg": 0.3172226846218109, + "lr": 2.548312883435583e-06, + "objective/entropy": 265.034912109375, + "objective/kl": 17.4699764251709, + "objective/non_score_reward": -1.746997594833374, + "objective/rlhf_reward": -6.587990260124206, + "objective/scores": 0.1, + "policy/approxkl_avg": 33.108619689941406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7924455404281616, + "step": 2356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985918998718262 + }, + { + "episode": 37728, + "epoch": 0.6781464572024302, + "loss/policy_avg": 0.11512107402086258, + "lr": 2.5481211656441718e-06, + "objective/entropy": 207.16583251953125, + "objective/kl": 22.506271362304688, + "objective/non_score_reward": -2.250627040863037, + "objective/rlhf_reward": -8.602507925033569, + "objective/scores": 0.1, + "policy/approxkl_avg": 48.099510192871094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5274972915649414, + "step": 2357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984009265899658 + }, + { + "episode": 37744, + "epoch": 0.6784340511198188, + "loss/policy_avg": -0.010802611708641052, + "lr": 2.547929447852761e-06, + "objective/entropy": -104.96611022949219, + "objective/kl": 11.402233123779297, + "objective/non_score_reward": -1.1402233839035034, + "objective/rlhf_reward": -6.560893535614014, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.0427619218826294, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.597365140914917, + "step": 2358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0010321140289307 + }, + { + "episode": 37760, + "epoch": 0.6787216450372074, + "loss/policy_avg": 0.5463196039199829, + "lr": 2.54773773006135e-06, + "objective/entropy": 110.8509292602539, + "objective/kl": 20.538684844970703, + "objective/non_score_reward": -2.0538687705993652, + "objective/rlhf_reward": -10.215475082397461, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.8312873840332, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4094330072402954, + "step": 2359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977003335952759 + }, + { + "episode": 37776, + "epoch": 0.6790092389545961, + "loss/policy_avg": 0.30159619450569153, + "lr": 2.5475460122699386e-06, + "objective/entropy": -3.8185462951660156, + "objective/kl": 15.697071075439453, + "objective/non_score_reward": -1.5697071552276611, + "objective/rlhf_reward": -5.878828680515289, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.05399703979492, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6199531555175781, + "step": 2360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9988099336624146 + }, + { + "episode": 37792, + "epoch": 0.6792968328719847, + "loss/policy_avg": 0.5906786918640137, + "lr": 2.547354294478528e-06, + "objective/entropy": -6.215213775634766, + "objective/kl": 11.929386138916016, + "objective/non_score_reward": -1.1929385662078857, + "objective/rlhf_reward": -4.37175435423851, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.593338012695312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7729129791259766, + "step": 2361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.997424602508545 + }, + { + "episode": 37808, + "epoch": 0.6795844267893734, + "loss/policy_avg": 0.7287322282791138, + "lr": 2.5471625766871166e-06, + "objective/entropy": -13.709220886230469, + "objective/kl": 10.416068077087402, + "objective/non_score_reward": -1.0416066646575928, + "objective/rlhf_reward": -3.7664268821477886, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.55803108215332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7005133628845215, + "step": 2362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9963767528533936 + }, + { + "episode": 37824, + "epoch": 0.679872020706762, + "loss/policy_avg": 0.5065807104110718, + "lr": 2.546970858895706e-06, + "objective/entropy": 165.01046752929688, + "objective/kl": 18.968791961669922, + "objective/non_score_reward": -1.8968794345855713, + "objective/rlhf_reward": -3.1875179469585415, + "objective/scores": 1.1, + "policy/approxkl_avg": 24.66299057006836, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7943272590637207, + "step": 2363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9983073472976685 + }, + { + "episode": 37840, + "epoch": 0.6801596146241508, + "loss/policy_avg": -0.09428460896015167, + "lr": 2.5467791411042947e-06, + "objective/entropy": 57.868064880371094, + "objective/kl": 19.484859466552734, + "objective/non_score_reward": -1.9484859704971313, + "objective/rlhf_reward": -5.393944060802459, + "objective/scores": 0.6, + "policy/approxkl_avg": 38.548187255859375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4131024479866028, + "step": 2364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998547911643982 + }, + { + "episode": 37856, + "epoch": 0.6804472085415394, + "loss/policy_avg": 0.014633553102612495, + "lr": 2.5465874233128835e-06, + "objective/entropy": 11.214363098144531, + "objective/kl": 15.86156940460205, + "objective/non_score_reward": -1.5861570835113525, + "objective/rlhf_reward": -5.944628274440765, + "objective/scores": 0.1, + "policy/approxkl_avg": 8.504777908325195, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5623284578323364, + "step": 2365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983117580413818 + }, + { + "episode": 37872, + "epoch": 0.680734802458928, + "loss/policy_avg": -0.0003791128983721137, + "lr": 2.5463957055214727e-06, + "objective/entropy": 114.24466705322266, + "objective/kl": 17.914379119873047, + "objective/non_score_reward": -1.7914378643035889, + "objective/rlhf_reward": -5.43241848150889, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 6.264281272888184, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6559627056121826, + "step": 2366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004148483276367 + }, + { + "episode": 37888, + "epoch": 0.6810223963763167, + "loss/policy_avg": 0.2917371690273285, + "lr": 2.5462039877300615e-06, + "objective/entropy": -3.1063995361328125, + "objective/kl": 19.82198715209961, + "objective/non_score_reward": -1.98219895362854, + "objective/rlhf_reward": -7.5287956357002255, + "objective/scores": 0.1, + "policy/approxkl_avg": 146.663330078125, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.6002984046936035, + "step": 2367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0023727416992188 + }, + { + "episode": 37904, + "epoch": 0.6813099902937053, + "loss/policy_avg": 0.2226359248161316, + "lr": 2.5460122699386504e-06, + "objective/entropy": -102.39707946777344, + "objective/kl": 19.871543884277344, + "objective/non_score_reward": -1.987154483795166, + "objective/rlhf_reward": -3.5486179426312443, + "objective/scores": 1.1, + "policy/approxkl_avg": 35.11748123168945, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7066978216171265, + "step": 2368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00105619430542 + }, + { + "episode": 37920, + "epoch": 0.681597584211094, + "loss/policy_avg": -0.22409255802631378, + "lr": 2.545820552147239e-06, + "objective/entropy": 28.46088409423828, + "objective/kl": 12.455662727355957, + "objective/non_score_reward": -1.2455663681030273, + "objective/rlhf_reward": -0.5822652339935299, + "objective/scores": 1.1, + "policy/approxkl_avg": 22.929950714111328, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.4477207660675049, + "step": 2369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993529319763184 + }, + { + "episode": 37936, + "epoch": 0.6818851781284826, + "loss/policy_avg": 0.5391968488693237, + "lr": 2.545628834355828e-06, + "objective/entropy": 200.6731719970703, + "objective/kl": 13.703605651855469, + "objective/non_score_reward": -1.3703603744506836, + "objective/rlhf_reward": -1.081441795825958, + "objective/scores": 1.1, + "policy/approxkl_avg": 22.747711181640625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7975244522094727, + "step": 2370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000143051147461 + }, + { + "episode": 37952, + "epoch": 0.6821727720458712, + "loss/policy_avg": 0.4769810736179352, + "lr": 2.545437116564417e-06, + "objective/entropy": 64.26600646972656, + "objective/kl": 18.710128784179688, + "objective/non_score_reward": -1.8710130453109741, + "objective/rlhf_reward": -5.53664095230573, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 182.27891540527344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5450712442398071, + "step": 2371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994101524353027 + }, + { + "episode": 37968, + "epoch": 0.6824603659632599, + "loss/policy_avg": -0.10944107174873352, + "lr": 2.545245398773006e-06, + "objective/entropy": -144.7941436767578, + "objective/kl": 12.572160720825195, + "objective/non_score_reward": -1.2572160959243774, + "objective/rlhf_reward": -3.2955309907595316, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 27.950170516967773, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5846720933914185, + "step": 2372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0016770362854004 + }, + { + "episode": 37984, + "epoch": 0.6827479598806485, + "loss/policy_avg": 0.490910142660141, + "lr": 2.5450536809815952e-06, + "objective/entropy": -3.4798126220703125, + "objective/kl": 16.669143676757812, + "objective/non_score_reward": -1.6669142246246338, + "objective/rlhf_reward": -4.5449509940305095, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 17.67532730102539, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8314906358718872, + "step": 2373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999407529830933 + }, + { + "episode": 38000, + "epoch": 0.6830355537980372, + "loss/policy_avg": -0.02355530858039856, + "lr": 2.544861963190184e-06, + "objective/entropy": 2.0637435913085938, + "objective/kl": 15.970002174377441, + "objective/non_score_reward": -1.597000241279602, + "objective/rlhf_reward": -4.563172216686319, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 41.44702911376953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43509942293167114, + "step": 2374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9991532564163208 + }, + { + "episode": 38016, + "epoch": 0.6833231477154258, + "loss/policy_avg": 0.5410069227218628, + "lr": 2.544670245398773e-06, + "objective/entropy": -88.23371887207031, + "objective/kl": 19.367130279541016, + "objective/non_score_reward": -1.9367129802703857, + "objective/rlhf_reward": -4.823133324028227, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 18.989822387695312, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4978526532649994, + "step": 2375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9983242750167847 + }, + { + "episode": 38032, + "epoch": 0.6836107416328144, + "loss/policy_avg": 0.07011735439300537, + "lr": 2.544478527607362e-06, + "objective/entropy": -34.97230529785156, + "objective/kl": 11.747721672058105, + "objective/non_score_reward": -1.1747722625732422, + "objective/rlhf_reward": -4.299088871479034, + "objective/scores": 0.1, + "policy/approxkl_avg": 15.02751636505127, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6224102973937988, + "step": 2376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0016679763793945 + }, + { + "episode": 38048, + "epoch": 0.6838983355502031, + "loss/policy_avg": 0.23295675218105316, + "lr": 2.544286809815951e-06, + "objective/entropy": -29.86457061767578, + "objective/kl": 14.828042984008789, + "objective/non_score_reward": -1.4828044176101685, + "objective/rlhf_reward": -5.531217566132545, + "objective/scores": 0.1, + "policy/approxkl_avg": 80.19185638427734, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.647622287273407, + "step": 2377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997412919998169 + }, + { + "episode": 38064, + "epoch": 0.6841859294675917, + "loss/policy_avg": 0.24506890773773193, + "lr": 2.54409509202454e-06, + "objective/entropy": -49.15517807006836, + "objective/kl": 13.11131477355957, + "objective/non_score_reward": -1.3111315965652466, + "objective/rlhf_reward": -0.8445264458656307, + "objective/scores": 1.1, + "policy/approxkl_avg": 10.21036148071289, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7912728786468506, + "step": 2378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999809980392456 + }, + { + "episode": 38080, + "epoch": 0.6844735233849804, + "loss/policy_avg": 0.022225193679332733, + "lr": 2.543903374233129e-06, + "objective/entropy": 226.58547973632812, + "objective/kl": 19.322124481201172, + "objective/non_score_reward": -1.9322123527526855, + "objective/rlhf_reward": -7.328849291801452, + "objective/scores": 0.1, + "policy/approxkl_avg": 84.02395629882812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.836097776889801, + "step": 2379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000347137451172 + }, + { + "episode": 38096, + "epoch": 0.6847611173023691, + "loss/policy_avg": 0.15902036428451538, + "lr": 2.5437116564417178e-06, + "objective/entropy": -104.35630798339844, + "objective/kl": 14.964473724365234, + "objective/non_score_reward": -1.4964473247528076, + "objective/rlhf_reward": -3.0620705827486248, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 42.95732879638672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4826052784919739, + "step": 2380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9977092742919922 + }, + { + "episode": 38112, + "epoch": 0.6850487112197577, + "loss/policy_avg": 0.17381584644317627, + "lr": 2.543519938650307e-06, + "objective/entropy": 153.31005859375, + "objective/kl": 12.905035018920898, + "objective/non_score_reward": -1.2905036211013794, + "objective/rlhf_reward": -4.762014365196228, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.92567253112793, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6593793630599976, + "step": 2381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99961256980896 + }, + { + "episode": 38128, + "epoch": 0.6853363051371464, + "loss/policy_avg": -0.15650756657123566, + "lr": 2.543328220858896e-06, + "objective/entropy": -78.62450408935547, + "objective/kl": 16.713642120361328, + "objective/non_score_reward": -1.6713643074035645, + "objective/rlhf_reward": -4.285456991195678, + "objective/scores": 0.6, + "policy/approxkl_avg": 33.79967498779297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7772436738014221, + "step": 2382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0038373470306396 + }, + { + "episode": 38144, + "epoch": 0.685623899054535, + "loss/policy_avg": 0.2791460156440735, + "lr": 2.5431365030674846e-06, + "objective/entropy": 45.02009582519531, + "objective/kl": 14.298861503601074, + "objective/non_score_reward": -1.4298863410949707, + "objective/rlhf_reward": -7.719544887542725, + "objective/scores": -0.5, + "policy/approxkl_avg": 13.945395469665527, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7025325894355774, + "step": 2383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9980533123016357 + }, + { + "episode": 38160, + "epoch": 0.6859114929719237, + "loss/policy_avg": 0.14568883180618286, + "lr": 2.542944785276074e-06, + "objective/entropy": 195.76580810546875, + "objective/kl": 16.112838745117188, + "objective/non_score_reward": -1.611283779144287, + "objective/rlhf_reward": -3.521416102291319, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 60.78529739379883, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5057171583175659, + "step": 2384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989773035049438 + }, + { + "episode": 38176, + "epoch": 0.6861990868893123, + "loss/policy_avg": 0.5063943862915039, + "lr": 2.5427530674846627e-06, + "objective/entropy": 165.6365203857422, + "objective/kl": 17.116657257080078, + "objective/non_score_reward": -1.7116656303405762, + "objective/rlhf_reward": -8.846662521362305, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.173545837402344, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6770908832550049, + "step": 2385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996287822723389 + }, + { + "episode": 38192, + "epoch": 0.6864866808067009, + "loss/policy_avg": 0.1624879390001297, + "lr": 2.542561349693252e-06, + "objective/entropy": 11.813888549804688, + "objective/kl": 10.612668991088867, + "objective/non_score_reward": -1.0612671375274658, + "objective/rlhf_reward": -6.245068550109863, + "objective/scores": -0.5, + "policy/approxkl_avg": 12.712059020996094, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7070558667182922, + "step": 2386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0033936500549316 + }, + { + "episode": 38208, + "epoch": 0.6867742747240896, + "loss/policy_avg": -0.14744633436203003, + "lr": 2.5423696319018407e-06, + "objective/entropy": 15.351272583007812, + "objective/kl": 20.102500915527344, + "objective/non_score_reward": -2.0102505683898926, + "objective/rlhf_reward": -7.641001945734025, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.63510513305664, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6769348978996277, + "step": 2387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00319242477417 + }, + { + "episode": 38224, + "epoch": 0.6870618686414782, + "loss/policy_avg": 0.3262820243835449, + "lr": 2.5421779141104295e-06, + "objective/entropy": -229.1365509033203, + "objective/kl": 3.1505727767944336, + "objective/non_score_reward": -0.31505724787712097, + "objective/rlhf_reward": -0.8602290138602258, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.0276970863342285, + "policy/clipfrac_avg": 0.0, + "policy/entropy_avg": 0.4453001916408539, + "step": 2388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0001306533813477 + }, + { + "episode": 38240, + "epoch": 0.6873494625588669, + "loss/policy_avg": 0.26754438877105713, + "lr": 2.5419861963190187e-06, + "objective/entropy": 255.03985595703125, + "objective/kl": 13.205718994140625, + "objective/non_score_reward": -1.3205718994140625, + "objective/rlhf_reward": -0.8822874784469601, + "objective/scores": 1.1, + "policy/approxkl_avg": 17.32726287841797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6657876372337341, + "step": 2389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001314640045166 + }, + { + "episode": 38256, + "epoch": 0.6876370564762555, + "loss/policy_avg": 0.3306029438972473, + "lr": 2.541794478527607e-06, + "objective/entropy": -46.668853759765625, + "objective/kl": 17.140121459960938, + "objective/non_score_reward": -1.7140121459960938, + "objective/rlhf_reward": -6.45604852437973, + "objective/scores": 0.1, + "policy/approxkl_avg": 55.39460754394531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8255484700202942, + "step": 2390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9972751140594482 + }, + { + "episode": 38272, + "epoch": 0.6879246503936441, + "loss/policy_avg": 0.02296963334083557, + "lr": 2.5416027607361964e-06, + "objective/entropy": -19.883819580078125, + "objective/kl": 13.475825309753418, + "objective/non_score_reward": -1.3475825786590576, + "objective/rlhf_reward": -7.3903303146362305, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.257452487945557, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7932485342025757, + "step": 2391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9987152814865112 + }, + { + "episode": 38288, + "epoch": 0.6882122443110328, + "loss/policy_avg": 0.1291026622056961, + "lr": 2.541411042944785e-06, + "objective/entropy": -122.60946655273438, + "objective/kl": 19.44902801513672, + "objective/non_score_reward": -1.9449028968811035, + "objective/rlhf_reward": -5.656905176416908, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 32.762733459472656, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7750039100646973, + "step": 2392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.997039794921875 + }, + { + "episode": 38304, + "epoch": 0.6884998382284214, + "loss/policy_avg": 0.4377826452255249, + "lr": 2.541219325153374e-06, + "objective/entropy": 132.87936401367188, + "objective/kl": 17.26844596862793, + "objective/non_score_reward": -1.7268447875976562, + "objective/rlhf_reward": -8.907379150390625, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.914093017578125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5781127214431763, + "step": 2393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998254776000977 + }, + { + "episode": 38320, + "epoch": 0.6887874321458101, + "loss/policy_avg": 0.4851827025413513, + "lr": 2.541027607361963e-06, + "objective/entropy": 41.502628326416016, + "objective/kl": 16.9117431640625, + "objective/non_score_reward": -1.6911745071411133, + "objective/rlhf_reward": -2.364697879552841, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.00037384033203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7911096811294556, + "step": 2394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9994444847106934 + }, + { + "episode": 38336, + "epoch": 0.6890750260631988, + "loss/policy_avg": 0.2244480699300766, + "lr": 2.540835889570552e-06, + "objective/entropy": 92.34452819824219, + "objective/kl": 17.634567260742188, + "objective/non_score_reward": -1.7634568214416504, + "objective/rlhf_reward": -5.320493773619333, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 41.524139404296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5093786716461182, + "step": 2395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998244047164917 + }, + { + "episode": 38352, + "epoch": 0.6893626199805875, + "loss/policy_avg": 0.04284711554646492, + "lr": 2.5406441717791413e-06, + "objective/entropy": 224.9025421142578, + "objective/kl": 17.01390266418457, + "objective/non_score_reward": -1.701390266418457, + "objective/rlhf_reward": -8.805561065673828, + "objective/scores": -0.5, + "policy/approxkl_avg": 28.81897735595703, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6981585025787354, + "step": 2396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998838901519775 + }, + { + "episode": 38368, + "epoch": 0.6896502138979761, + "loss/policy_avg": 3.3234200477600098, + "lr": 2.54045245398773e-06, + "objective/entropy": 19.97397804260254, + "objective/kl": 20.188440322875977, + "objective/non_score_reward": -2.0188441276550293, + "objective/rlhf_reward": -5.151657138706419, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 16.88092803955078, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5968458652496338, + "step": 2397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0011932849884033 + }, + { + "episode": 38384, + "epoch": 0.6899378078153647, + "loss/policy_avg": 0.5251247882843018, + "lr": 2.540260736196319e-06, + "objective/entropy": 1.8503284454345703, + "objective/kl": 19.552143096923828, + "objective/non_score_reward": -1.9552143812179565, + "objective/rlhf_reward": -4.89713830196974, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 6.352148532867432, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6283992528915405, + "step": 2398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00150990486145 + }, + { + "episode": 38400, + "epoch": 0.6902254017327534, + "loss/policy_avg": 0.30495020747184753, + "lr": 2.540069018404908e-06, + "objective/entropy": 225.76051330566406, + "objective/kl": 18.361190795898438, + "objective/non_score_reward": -1.8361191749572754, + "objective/rlhf_reward": -6.944476789236068, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.112527847290039, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5440366268157959, + "step": 2399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987056255340576 + }, + { + "episode": 38416, + "epoch": 0.690512995650142, + "loss/policy_avg": -0.05993441119790077, + "lr": 2.539877300613497e-06, + "objective/entropy": -79.60920715332031, + "objective/kl": 7.422837257385254, + "objective/non_score_reward": -0.7422837018966675, + "objective/rlhf_reward": 1.4308651030063633, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.5689568519592285, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.5827436447143555, + "step": 2400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.015185594558716 + }, + { + "episode": 38432, + "epoch": 0.6908005895675307, + "loss/policy_avg": 0.0555083267390728, + "lr": 2.539685582822086e-06, + "objective/entropy": 232.16439819335938, + "objective/kl": 17.389009475708008, + "objective/non_score_reward": -1.738900899887085, + "objective/rlhf_reward": -4.832897307650123, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 52.19811248779297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5548558235168457, + "step": 2401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012123584747314 + }, + { + "episode": 38448, + "epoch": 0.6910881834849193, + "loss/policy_avg": 0.05764458328485489, + "lr": 2.539493865030675e-06, + "objective/entropy": -11.375543594360352, + "objective/kl": 14.564275741577148, + "objective/non_score_reward": -1.4564275741577148, + "objective/rlhf_reward": -4.092377052704493, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 19.830394744873047, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4726178050041199, + "step": 2402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997546672821045 + }, + { + "episode": 38464, + "epoch": 0.6913757774023079, + "loss/policy_avg": 0.6865073442459106, + "lr": 2.5393021472392638e-06, + "objective/entropy": 84.86620330810547, + "objective/kl": 13.430087089538574, + "objective/non_score_reward": -1.3430086374282837, + "objective/rlhf_reward": -7.372034072875977, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.909427642822266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4831317067146301, + "step": 2403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9976210594177246 + }, + { + "episode": 38480, + "epoch": 0.6916633713196966, + "loss/policy_avg": 0.21324029564857483, + "lr": 2.539110429447853e-06, + "objective/entropy": 129.78118896484375, + "objective/kl": 19.45285415649414, + "objective/non_score_reward": -1.9452855587005615, + "objective/rlhf_reward": -7.3811422944068905, + "objective/scores": 0.1, + "policy/approxkl_avg": 32.29778289794922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5191063284873962, + "step": 2404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0007407665252686 + }, + { + "episode": 38496, + "epoch": 0.6919509652370852, + "loss/policy_avg": 0.3362481892108917, + "lr": 2.538918711656442e-06, + "objective/entropy": -218.5269775390625, + "objective/kl": 19.677339553833008, + "objective/non_score_reward": -1.9677337408065796, + "objective/rlhf_reward": -3.4709349930286404, + "objective/scores": 1.1, + "policy/approxkl_avg": 140.0068359375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7181460857391357, + "step": 2405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9982805252075195 + }, + { + "episode": 38512, + "epoch": 0.6922385591544739, + "loss/policy_avg": 0.5659635663032532, + "lr": 2.5387269938650306e-06, + "objective/entropy": 295.00103759765625, + "objective/kl": 30.044387817382812, + "objective/non_score_reward": -3.004439353942871, + "objective/rlhf_reward": -11.617756938934328, + "objective/scores": 0.1, + "policy/approxkl_avg": 239.81732177734375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7165085673332214, + "step": 2406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972922801971436 + }, + { + "episode": 38528, + "epoch": 0.6925261530718625, + "loss/policy_avg": 0.27810075879096985, + "lr": 2.53853527607362e-06, + "objective/entropy": -57.76658630371094, + "objective/kl": 13.402792930603027, + "objective/non_score_reward": -1.3402793407440186, + "objective/rlhf_reward": -4.961117750406265, + "objective/scores": 0.1, + "policy/approxkl_avg": 6.219494819641113, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4802490174770355, + "step": 2407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982696771621704 + }, + { + "episode": 38544, + "epoch": 0.6928137469892511, + "loss/policy_avg": 0.7481805086135864, + "lr": 2.5383435582822087e-06, + "objective/entropy": 66.06964111328125, + "objective/kl": 18.523412704467773, + "objective/non_score_reward": -1.8523410558700562, + "objective/rlhf_reward": -5.286657991186653, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 39.415714263916016, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.3947157859802246, + "step": 2408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989726543426514 + }, + { + "episode": 38560, + "epoch": 0.6931013409066398, + "loss/policy_avg": 0.0016599632799625397, + "lr": 2.538151840490798e-06, + "objective/entropy": -116.66150665283203, + "objective/kl": 6.039176940917969, + "objective/non_score_reward": -0.60391765832901, + "objective/rlhf_reward": -0.29296445317653186, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 1.3719651699066162, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7192214727401733, + "step": 2409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000486135482788 + }, + { + "episode": 38576, + "epoch": 0.6933889348240285, + "loss/policy_avg": 0.3459470272064209, + "lr": 2.5379601226993867e-06, + "objective/entropy": -75.4835205078125, + "objective/kl": 15.965251922607422, + "objective/non_score_reward": -1.5965251922607422, + "objective/rlhf_reward": -3.986101126670837, + "objective/scores": 0.6, + "policy/approxkl_avg": 52.870975494384766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7970390319824219, + "step": 2410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9975183010101318 + }, + { + "episode": 38592, + "epoch": 0.6936765287414172, + "loss/policy_avg": 0.4983133375644684, + "lr": 2.5377684049079755e-06, + "objective/entropy": 6.6052703857421875, + "objective/kl": 8.632669448852539, + "objective/non_score_reward": -0.8632669448852539, + "objective/rlhf_reward": -3.0530677199363705, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.3412885665893555, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.35764527320861816, + "step": 2411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9993387460708618 + }, + { + "episode": 38608, + "epoch": 0.6939641226588058, + "loss/policy_avg": 0.1209329292178154, + "lr": 2.5375766871165643e-06, + "objective/entropy": 3.1751174926757812, + "objective/kl": 15.329707145690918, + "objective/non_score_reward": -1.532970666885376, + "objective/rlhf_reward": -3.7318827271461483, + "objective/scores": 0.6, + "policy/approxkl_avg": 17.169780731201172, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5270591974258423, + "step": 2412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985158443450928 + }, + { + "episode": 38624, + "epoch": 0.6942517165761944, + "loss/policy_avg": 0.38980090618133545, + "lr": 2.537384969325153e-06, + "objective/entropy": 288.11083984375, + "objective/kl": 23.299327850341797, + "objective/non_score_reward": -2.3299331665039062, + "objective/rlhf_reward": -11.319732666015625, + "objective/scores": -0.5, + "policy/approxkl_avg": 58.987186431884766, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7453069686889648, + "step": 2413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009350776672363 + }, + { + "episode": 38640, + "epoch": 0.6945393104935831, + "loss/policy_avg": 0.6178219318389893, + "lr": 2.5371932515337424e-06, + "objective/entropy": 159.4707489013672, + "objective/kl": 17.919052124023438, + "objective/non_score_reward": -1.791905403137207, + "objective/rlhf_reward": -6.767621493339538, + "objective/scores": 0.1, + "policy/approxkl_avg": 15.933883666992188, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7238528728485107, + "step": 2414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987776279449463 + }, + { + "episode": 38656, + "epoch": 0.6948269044109717, + "loss/policy_avg": 0.05612446367740631, + "lr": 2.537001533742331e-06, + "objective/entropy": -43.256649017333984, + "objective/kl": 16.837635040283203, + "objective/non_score_reward": -1.6837635040283203, + "objective/rlhf_reward": -2.3350535988807675, + "objective/scores": 1.1, + "policy/approxkl_avg": 38.66826629638672, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5314440131187439, + "step": 2415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0007174015045166 + }, + { + "episode": 38672, + "epoch": 0.6951144983283604, + "loss/policy_avg": 0.31168970465660095, + "lr": 2.5368098159509204e-06, + "objective/entropy": 250.38308715820312, + "objective/kl": 22.91397476196289, + "objective/non_score_reward": -2.2913975715637207, + "objective/rlhf_reward": -11.165590286254883, + "objective/scores": -0.5, + "policy/approxkl_avg": 56.341278076171875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6258459687232971, + "step": 2416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974250793457031 + }, + { + "episode": 38688, + "epoch": 0.695402092245749, + "loss/policy_avg": -0.04682635888457298, + "lr": 2.5366180981595092e-06, + "objective/entropy": -6.0383758544921875, + "objective/kl": 13.405521392822266, + "objective/non_score_reward": -1.3405520915985107, + "objective/rlhf_reward": -3.8059493293433935, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 8.999337196350098, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7096725702285767, + "step": 2417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0013351440429688 + }, + { + "episode": 38704, + "epoch": 0.6956896861631376, + "loss/policy_avg": 0.0935414507985115, + "lr": 2.536426380368098e-06, + "objective/entropy": -0.039379119873046875, + "objective/kl": 16.317344665527344, + "objective/non_score_reward": -1.6317346096038818, + "objective/rlhf_reward": -4.579526851849492, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 2.4132003784179688, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5124933123588562, + "step": 2418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999948501586914 + }, + { + "episode": 38720, + "epoch": 0.6959772800805263, + "loss/policy_avg": 0.11428681015968323, + "lr": 2.5362346625766873e-06, + "objective/entropy": 169.2416229248047, + "objective/kl": 14.574958801269531, + "objective/non_score_reward": -1.457495927810669, + "objective/rlhf_reward": -5.429983532428741, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.09098815917969, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8106528520584106, + "step": 2419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0020008087158203 + }, + { + "episode": 38736, + "epoch": 0.6962648739979149, + "loss/policy_avg": 0.5395548343658447, + "lr": 2.536042944785276e-06, + "objective/entropy": 69.16426086425781, + "objective/kl": 16.80579948425293, + "objective/non_score_reward": -1.6805799007415771, + "objective/rlhf_reward": -8.722319602966309, + "objective/scores": -0.5, + "policy/approxkl_avg": 21.360477447509766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.624801516532898, + "step": 2420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997470498085022 + }, + { + "episode": 38752, + "epoch": 0.6965524679153036, + "loss/policy_avg": 0.30645912885665894, + "lr": 2.535851226993865e-06, + "objective/entropy": -11.797195434570312, + "objective/kl": 17.944252014160156, + "objective/non_score_reward": -1.7944250106811523, + "objective/rlhf_reward": -6.7777004599571224, + "objective/scores": 0.1, + "policy/approxkl_avg": 14.293865203857422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4035222828388214, + "step": 2421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990260601043701 + }, + { + "episode": 38768, + "epoch": 0.6968400618326922, + "loss/policy_avg": 0.5221757292747498, + "lr": 2.535659509202454e-06, + "objective/entropy": -88.67009735107422, + "objective/kl": 13.959169387817383, + "objective/non_score_reward": -1.3959168195724487, + "objective/rlhf_reward": -5.183667278289795, + "objective/scores": 0.1, + "policy/approxkl_avg": 63.5391845703125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.936865508556366, + "step": 2422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9974651336669922 + }, + { + "episode": 38784, + "epoch": 0.6971276557500808, + "loss/policy_avg": 0.192846417427063, + "lr": 2.535467791411043e-06, + "objective/entropy": 76.54438781738281, + "objective/kl": 14.187356948852539, + "objective/non_score_reward": -1.4187356233596802, + "objective/rlhf_reward": -1.2749424934387203, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.399131774902344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6197546720504761, + "step": 2423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9990915060043335 + }, + { + "episode": 38800, + "epoch": 0.6974152496674695, + "loss/policy_avg": 0.09757228195667267, + "lr": 2.535276073619632e-06, + "objective/entropy": 9.487838745117188, + "objective/kl": 16.353984832763672, + "objective/non_score_reward": -1.6353983879089355, + "objective/rlhf_reward": -3.6178748949777813, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 48.739418029785156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6486073732376099, + "step": 2424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970083236694336 + }, + { + "episode": 38816, + "epoch": 0.6977028435848582, + "loss/policy_avg": 0.020987726747989655, + "lr": 2.535084355828221e-06, + "objective/entropy": 76.13706970214844, + "objective/kl": 14.1550931930542, + "objective/non_score_reward": -1.4155092239379883, + "objective/rlhf_reward": -5.262037134170532, + "objective/scores": 0.1, + "policy/approxkl_avg": 8.276832580566406, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7149036526679993, + "step": 2425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9996775388717651 + }, + { + "episode": 38832, + "epoch": 0.6979904375022469, + "loss/policy_avg": 0.04890640825033188, + "lr": 2.5348926380368098e-06, + "objective/entropy": 46.987709045410156, + "objective/kl": 14.668130874633789, + "objective/non_score_reward": -1.466813087463379, + "objective/rlhf_reward": -5.467252647876739, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.003772735595703, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.497058242559433, + "step": 2426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9984325170516968 + }, + { + "episode": 38848, + "epoch": 0.6982780314196355, + "loss/policy_avg": -0.045340899378061295, + "lr": 2.534700920245399e-06, + "objective/entropy": -37.19403076171875, + "objective/kl": 12.195722579956055, + "objective/non_score_reward": -1.2195723056793213, + "objective/rlhf_reward": -1.954570126475069, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 27.273773193359375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7038757801055908, + "step": 2427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9979445934295654 + }, + { + "episode": 38864, + "epoch": 0.6985656253370242, + "loss/policy_avg": 0.17511284351348877, + "lr": 2.534509202453988e-06, + "objective/entropy": 11.486076354980469, + "objective/kl": 22.701976776123047, + "objective/non_score_reward": -2.270197868347168, + "objective/rlhf_reward": -4.680790758132934, + "objective/scores": 1.1, + "policy/approxkl_avg": 59.19794464111328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6279011964797974, + "step": 2428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9995098114013672 + }, + { + "episode": 38880, + "epoch": 0.6988532192544128, + "loss/policy_avg": 0.18574991822242737, + "lr": 2.534317484662577e-06, + "objective/entropy": -54.96965026855469, + "objective/kl": 19.47069549560547, + "objective/non_score_reward": -1.947069525718689, + "objective/rlhf_reward": -3.388278222084045, + "objective/scores": 1.1, + "policy/approxkl_avg": 38.52406311035156, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5313050746917725, + "step": 2429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999105453491211 + }, + { + "episode": 38896, + "epoch": 0.6991408131718014, + "loss/policy_avg": 1.562233805656433, + "lr": 2.534125766871166e-06, + "objective/entropy": 67.32986450195312, + "objective/kl": 21.02432632446289, + "objective/non_score_reward": -2.1024327278137207, + "objective/rlhf_reward": -8.009731090068817, + "objective/scores": 0.1, + "policy/approxkl_avg": 72.18418884277344, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7826734185218811, + "step": 2430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9975332021713257 + }, + { + "episode": 38912, + "epoch": 0.6994284070891901, + "loss/policy_avg": -0.02676713466644287, + "lr": 2.5339340490797547e-06, + "objective/entropy": -116.99898529052734, + "objective/kl": 16.28338623046875, + "objective/non_score_reward": -1.6283388137817383, + "objective/rlhf_reward": -8.513355255126953, + "objective/scores": -0.5, + "policy/approxkl_avg": 123.47506713867188, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8290122747421265, + "step": 2431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9993462562561035 + }, + { + "episode": 38928, + "epoch": 0.6997160010065787, + "loss/policy_avg": 0.010940248146653175, + "lr": 2.533742331288344e-06, + "objective/entropy": 219.49395751953125, + "objective/kl": 13.107633590698242, + "objective/non_score_reward": -1.3107634782791138, + "objective/rlhf_reward": -7.243053913116455, + "objective/scores": -0.5, + "policy/approxkl_avg": 16.813209533691406, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6526888608932495, + "step": 2432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998497724533081 + }, + { + "episode": 38944, + "epoch": 0.7000035949239674, + "loss/policy_avg": 0.136129692196846, + "lr": 2.5335506134969327e-06, + "objective/entropy": -105.5390853881836, + "objective/kl": 18.09296226501465, + "objective/non_score_reward": -1.8092962503433228, + "objective/rlhf_reward": -4.313465987087461, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 12.864763259887695, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8648643493652344, + "step": 2433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0009799003601074 + }, + { + "episode": 38960, + "epoch": 0.700291188841356, + "loss/policy_avg": -0.19945435225963593, + "lr": 2.5333588957055215e-06, + "objective/entropy": 113.15518188476562, + "objective/kl": 9.712821006774902, + "objective/non_score_reward": -0.9712820649147034, + "objective/rlhf_reward": 0.5148716807365421, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.9770641326904297, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.40460115671157837, + "step": 2434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0024404525756836 + }, + { + "episode": 38976, + "epoch": 0.7005787827587446, + "loss/policy_avg": 0.0931832492351532, + "lr": 2.5331671779141103e-06, + "objective/entropy": 114.05369567871094, + "objective/kl": 15.93886947631836, + "objective/non_score_reward": -1.5938870906829834, + "objective/rlhf_reward": -8.375548362731934, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.6255927085876465, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5873702764511108, + "step": 2435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991543292999268 + }, + { + "episode": 38992, + "epoch": 0.7008663766761333, + "loss/policy_avg": 0.5738829970359802, + "lr": 2.532975460122699e-06, + "objective/entropy": 156.0251922607422, + "objective/kl": 16.723852157592773, + "objective/non_score_reward": -1.6723852157592773, + "objective/rlhf_reward": -6.289540684223175, + "objective/scores": 0.1, + "policy/approxkl_avg": 28.836315155029297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8467593789100647, + "step": 2436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9972467422485352 + }, + { + "episode": 39008, + "epoch": 0.7011539705935219, + "loss/policy_avg": 0.0672391876578331, + "lr": 2.5327837423312884e-06, + "objective/entropy": 65.33531188964844, + "objective/kl": 8.784222602844238, + "objective/non_score_reward": -0.8784223198890686, + "objective/rlhf_reward": -3.1136892795562745, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.5697336196899414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5480427742004395, + "step": 2437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989595413208008 + }, + { + "episode": 39024, + "epoch": 0.7014415645109106, + "loss/policy_avg": -0.06193147599697113, + "lr": 2.532592024539877e-06, + "objective/entropy": 282.8427429199219, + "objective/kl": 23.06869888305664, + "objective/non_score_reward": -2.3068699836730957, + "objective/rlhf_reward": -4.827480456233024, + "objective/scores": 1.1, + "policy/approxkl_avg": 47.57867431640625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7043386697769165, + "step": 2438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004873275756836 + }, + { + "episode": 39040, + "epoch": 0.7017291584282992, + "loss/policy_avg": 0.16966520249843597, + "lr": 2.5324003067484664e-06, + "objective/entropy": -119.66554260253906, + "objective/kl": 20.203125, + "objective/non_score_reward": -2.020312786102295, + "objective/rlhf_reward": -7.681250667572021, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.11670684814453, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5933494567871094, + "step": 2439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9998090267181396 + }, + { + "episode": 39056, + "epoch": 0.7020167523456878, + "loss/policy_avg": 0.4376192092895508, + "lr": 2.5322085889570552e-06, + "objective/entropy": -136.1998291015625, + "objective/kl": 14.186405181884766, + "objective/non_score_reward": -1.4186406135559082, + "objective/rlhf_reward": -3.5518559537091594, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 56.663612365722656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4070212244987488, + "step": 2440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9971537590026855 + }, + { + "episode": 39072, + "epoch": 0.7023043462630766, + "loss/policy_avg": 0.12608441710472107, + "lr": 2.532016871165644e-06, + "objective/entropy": 134.5458984375, + "objective/kl": 12.70638370513916, + "objective/non_score_reward": -1.2706384658813477, + "objective/rlhf_reward": -4.682553744316101, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.481170654296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5888222455978394, + "step": 2441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981567859649658 + }, + { + "episode": 39088, + "epoch": 0.7025919401804652, + "loss/policy_avg": 0.1018877923488617, + "lr": 2.5318251533742333e-06, + "objective/entropy": -259.6712646484375, + "objective/kl": 14.526229858398438, + "objective/non_score_reward": -1.452622890472412, + "objective/rlhf_reward": -7.810491561889648, + "objective/scores": -0.5, + "policy/approxkl_avg": 26.93299102783203, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5898329615592957, + "step": 2442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9982686042785645 + }, + { + "episode": 39104, + "epoch": 0.7028795340978539, + "loss/policy_avg": 0.6017838716506958, + "lr": 2.531633435582822e-06, + "objective/entropy": -28.566848754882812, + "objective/kl": 14.843305587768555, + "objective/non_score_reward": -1.484330654144287, + "objective/rlhf_reward": -7.937322616577148, + "objective/scores": -0.5, + "policy/approxkl_avg": 7.366446495056152, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6063733696937561, + "step": 2443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9991109371185303 + }, + { + "episode": 39120, + "epoch": 0.7031671280152425, + "loss/policy_avg": 0.20822694897651672, + "lr": 2.531441717791411e-06, + "objective/entropy": 80.4629135131836, + "objective/kl": 16.56536865234375, + "objective/non_score_reward": -1.6565370559692383, + "objective/rlhf_reward": -3.7024291499864788, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.534392356872559, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7806668281555176, + "step": 2444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9979422092437744 + }, + { + "episode": 39136, + "epoch": 0.7034547219326311, + "loss/policy_avg": 0.1813095510005951, + "lr": 2.53125e-06, + "objective/entropy": 131.69081115722656, + "objective/kl": 13.948923110961914, + "objective/non_score_reward": -1.3948922157287598, + "objective/rlhf_reward": -1.1795689821243283, + "objective/scores": 1.1, + "policy/approxkl_avg": 9.80681037902832, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8091514110565186, + "step": 2445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998410701751709 + }, + { + "episode": 39152, + "epoch": 0.7037423158500198, + "loss/policy_avg": 0.2942613363265991, + "lr": 2.531058282208589e-06, + "objective/entropy": 230.11111450195312, + "objective/kl": 16.492549896240234, + "objective/non_score_reward": -1.6492549180984497, + "objective/rlhf_reward": -6.197019553184509, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.10110473632812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5611556768417358, + "step": 2446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000793933868408 + }, + { + "episode": 39168, + "epoch": 0.7040299097674084, + "loss/policy_avg": -0.0017958208918571472, + "lr": 2.530866564417178e-06, + "objective/entropy": -88.00894165039062, + "objective/kl": 13.98267936706543, + "objective/non_score_reward": -1.3982679843902588, + "objective/rlhf_reward": -7.593071937561035, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.794038772583008, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.884827733039856, + "step": 2447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.001452922821045 + }, + { + "episode": 39184, + "epoch": 0.7043175036847971, + "loss/policy_avg": 0.7869254946708679, + "lr": 2.530674846625767e-06, + "objective/entropy": 63.617061614990234, + "objective/kl": 20.715608596801758, + "objective/non_score_reward": -2.0715606212615967, + "objective/rlhf_reward": -5.8862427234649655, + "objective/scores": 0.6, + "policy/approxkl_avg": 187.70083618164062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8969687223434448, + "step": 2448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9987365007400513 + }, + { + "episode": 39200, + "epoch": 0.7046050976021857, + "loss/policy_avg": 0.31531211733818054, + "lr": 2.5304831288343558e-06, + "objective/entropy": 15.844051361083984, + "objective/kl": 20.274208068847656, + "objective/non_score_reward": -2.027420997619629, + "objective/rlhf_reward": -10.109683990478516, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.42197036743164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8046696186065674, + "step": 2449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989368915557861 + }, + { + "episode": 39216, + "epoch": 0.7048926915195743, + "loss/policy_avg": 0.29321062564849854, + "lr": 2.530291411042945e-06, + "objective/entropy": 217.91275024414062, + "objective/kl": 14.801872253417969, + "objective/non_score_reward": -1.4801872968673706, + "objective/rlhf_reward": -7.920749187469482, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.509456634521484, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7837471961975098, + "step": 2450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977532625198364 + }, + { + "episode": 39232, + "epoch": 0.705180285436963, + "loss/policy_avg": 0.04437948763370514, + "lr": 2.530099693251534e-06, + "objective/entropy": 42.03483581542969, + "objective/kl": 12.025967597961426, + "objective/non_score_reward": -1.20259690284729, + "objective/rlhf_reward": -6.81038761138916, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.049354553222656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658707320690155, + "step": 2451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000619888305664 + }, + { + "episode": 39248, + "epoch": 0.7054678793543516, + "loss/policy_avg": -0.3899479806423187, + "lr": 2.529907975460123e-06, + "objective/entropy": 134.8405303955078, + "objective/kl": 13.88528823852539, + "objective/non_score_reward": -1.388528823852539, + "objective/rlhf_reward": -1.1541152358055111, + "objective/scores": 1.1, + "policy/approxkl_avg": 38.11298370361328, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6311709880828857, + "step": 2452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998467206954956 + }, + { + "episode": 39264, + "epoch": 0.7057554732717403, + "loss/policy_avg": 0.1815769076347351, + "lr": 2.529716257668712e-06, + "objective/entropy": 17.50238037109375, + "objective/kl": 15.831437110900879, + "objective/non_score_reward": -1.583143711090088, + "objective/rlhf_reward": -8.332574844360352, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.159279823303223, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5247014760971069, + "step": 2453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9976840019226074 + }, + { + "episode": 39280, + "epoch": 0.7060430671891289, + "loss/policy_avg": 0.4368734359741211, + "lr": 2.5295245398773007e-06, + "objective/entropy": 74.67704772949219, + "objective/kl": 18.40035057067871, + "objective/non_score_reward": -1.840035080909729, + "objective/rlhf_reward": -5.237434091345344, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 21.29029083251953, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6295360326766968, + "step": 2454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989995956420898 + }, + { + "episode": 39296, + "epoch": 0.7063306611065175, + "loss/policy_avg": 0.03266135975718498, + "lr": 2.52933282208589e-06, + "objective/entropy": 137.1201934814453, + "objective/kl": 10.690437316894531, + "objective/non_score_reward": -1.0690436363220215, + "objective/rlhf_reward": -1.8761746942996977, + "objective/scores": 0.6, + "policy/approxkl_avg": 24.7198429107666, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4544951319694519, + "step": 2455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994776248931885 + }, + { + "episode": 39312, + "epoch": 0.7066182550239063, + "loss/policy_avg": 1.1425162553787231, + "lr": 2.5291411042944787e-06, + "objective/entropy": 49.3416862487793, + "objective/kl": 13.393396377563477, + "objective/non_score_reward": -1.3393394947052002, + "objective/rlhf_reward": -7.357357501983643, + "objective/scores": -0.5, + "policy/approxkl_avg": 1.4734693765640259, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.6386362910270691, + "step": 2456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000995635986328 + }, + { + "episode": 39328, + "epoch": 0.7069058489412949, + "loss/policy_avg": 0.3333868980407715, + "lr": 2.5289493865030675e-06, + "objective/entropy": 180.9315948486328, + "objective/kl": 20.428478240966797, + "objective/non_score_reward": -2.0428476333618164, + "objective/rlhf_reward": -5.247671757580015, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 88.2296371459961, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5422890186309814, + "step": 2457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996193766593933 + }, + { + "episode": 39344, + "epoch": 0.7071934428586836, + "loss/policy_avg": 0.41165536642074585, + "lr": 2.5287576687116563e-06, + "objective/entropy": -129.3858184814453, + "objective/kl": 15.993926048278809, + "objective/non_score_reward": -1.5993927717208862, + "objective/rlhf_reward": -1.9975711464881893, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.37614631652832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4606460928916931, + "step": 2458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.997300386428833 + }, + { + "episode": 39360, + "epoch": 0.7074810367760722, + "loss/policy_avg": -0.09240938723087311, + "lr": 2.528565950920245e-06, + "objective/entropy": 91.71644592285156, + "objective/kl": 17.114168167114258, + "objective/non_score_reward": -1.7114169597625732, + "objective/rlhf_reward": -5.020839090618203, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 0.605006217956543, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7479628324508667, + "step": 2459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001898765563965 + }, + { + "episode": 39376, + "epoch": 0.7077686306934609, + "loss/policy_avg": 0.4263072907924652, + "lr": 2.5283742331288344e-06, + "objective/entropy": 52.519737243652344, + "objective/kl": 12.139659881591797, + "objective/non_score_reward": -1.2139661312103271, + "objective/rlhf_reward": -6.855864524841309, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.710725784301758, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6820878982543945, + "step": 2460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9982566833496094 + }, + { + "episode": 39392, + "epoch": 0.7080562246108495, + "loss/policy_avg": -0.23309458792209625, + "lr": 2.528182515337423e-06, + "objective/entropy": 82.60530853271484, + "objective/kl": 13.121841430664062, + "objective/non_score_reward": -1.3121840953826904, + "objective/rlhf_reward": -7.248736381530762, + "objective/scores": -0.5, + "policy/approxkl_avg": 14.2933349609375, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6222156286239624, + "step": 2461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0058553218841553 + }, + { + "episode": 39408, + "epoch": 0.7083438185282381, + "loss/policy_avg": 0.5179277658462524, + "lr": 2.5279907975460124e-06, + "objective/entropy": 95.78985595703125, + "objective/kl": 11.426376342773438, + "objective/non_score_reward": -1.1426376104354858, + "objective/rlhf_reward": -4.1705506503582, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.478578090667725, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5440711975097656, + "step": 2462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9992921352386475 + }, + { + "episode": 39424, + "epoch": 0.7086314124456268, + "loss/policy_avg": 0.42213958501815796, + "lr": 2.5277990797546012e-06, + "objective/entropy": 62.700016021728516, + "objective/kl": 11.761016845703125, + "objective/non_score_reward": -1.1761016845703125, + "objective/rlhf_reward": -4.304406976699829, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.750205993652344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44627535343170166, + "step": 2463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0002737045288086 + }, + { + "episode": 39440, + "epoch": 0.7089190063630154, + "loss/policy_avg": 2.15847110748291, + "lr": 2.52760736196319e-06, + "objective/entropy": -23.891254425048828, + "objective/kl": 17.260528564453125, + "objective/non_score_reward": -1.7260527610778809, + "objective/rlhf_reward": -6.504211223125457, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.24324035644531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6024461984634399, + "step": 2464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0029826164245605 + }, + { + "episode": 39456, + "epoch": 0.709206600280404, + "loss/policy_avg": 0.08471131324768066, + "lr": 2.5274156441717793e-06, + "objective/entropy": 52.2718505859375, + "objective/kl": 14.018486022949219, + "objective/non_score_reward": -1.401848554611206, + "objective/rlhf_reward": -1.2073942780494686, + "objective/scores": 1.1, + "policy/approxkl_avg": 28.739349365234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8441653251647949, + "step": 2465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9985237121582031 + }, + { + "episode": 39472, + "epoch": 0.7094941941977927, + "loss/policy_avg": 1.2592285871505737, + "lr": 2.527223926380368e-06, + "objective/entropy": -151.93869018554688, + "objective/kl": 5.660920143127441, + "objective/non_score_reward": -0.5660920143127441, + "objective/rlhf_reward": -0.7081087817817475, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 1.2069416046142578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7391703128814697, + "step": 2466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 2.0010972023010254 + }, + { + "episode": 39488, + "epoch": 0.7097817881151813, + "loss/policy_avg": 0.3982200026512146, + "lr": 2.5270322085889573e-06, + "objective/entropy": 120.27207946777344, + "objective/kl": 14.920924186706543, + "objective/non_score_reward": -1.4920926094055176, + "objective/rlhf_reward": -3.0446511849176616, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 3.3740313053131104, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6713103652000427, + "step": 2467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0001144409179688 + }, + { + "episode": 39504, + "epoch": 0.71006938203257, + "loss/policy_avg": 0.24763520061969757, + "lr": 2.526840490797546e-06, + "objective/entropy": 201.60816955566406, + "objective/kl": 21.655441284179688, + "objective/non_score_reward": -2.165544033050537, + "objective/rlhf_reward": -8.262176132202148, + "objective/scores": 0.1, + "policy/approxkl_avg": 79.32552337646484, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5143510103225708, + "step": 2468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986385107040405 + }, + { + "episode": 39520, + "epoch": 0.7103569759499586, + "loss/policy_avg": 0.3981194496154785, + "lr": 2.526648773006135e-06, + "objective/entropy": 122.44503784179688, + "objective/kl": 14.38865852355957, + "objective/non_score_reward": -1.4388657808303833, + "objective/rlhf_reward": -5.3554630935192105, + "objective/scores": 0.1, + "policy/approxkl_avg": 19.020545959472656, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.46620768308639526, + "step": 2469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975099563598633 + }, + { + "episode": 39536, + "epoch": 0.7106445698673473, + "loss/policy_avg": -0.22603286802768707, + "lr": 2.526457055214724e-06, + "objective/entropy": -205.59811401367188, + "objective/kl": 9.54704475402832, + "objective/non_score_reward": -0.9547045826911926, + "objective/rlhf_reward": -3.41881833076477, + "objective/scores": 0.1, + "policy/approxkl_avg": 12.569239616394043, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5243770480155945, + "step": 2470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0026748180389404 + }, + { + "episode": 39552, + "epoch": 0.710932163784736, + "loss/policy_avg": 0.17075514793395996, + "lr": 2.526265337423313e-06, + "objective/entropy": -8.069732666015625, + "objective/kl": 14.373933792114258, + "objective/non_score_reward": -1.4373931884765625, + "objective/rlhf_reward": -1.3495728135108944, + "objective/scores": 1.1, + "policy/approxkl_avg": 74.23524475097656, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5825380086898804, + "step": 2471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9990153312683105 + }, + { + "episode": 39568, + "epoch": 0.7112197577021246, + "loss/policy_avg": 0.23406949639320374, + "lr": 2.5260736196319018e-06, + "objective/entropy": -230.138916015625, + "objective/kl": 13.352453231811523, + "objective/non_score_reward": -1.3352453708648682, + "objective/rlhf_reward": -4.940981125831604, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.360355377197266, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.656883955001831, + "step": 2472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9998586177825928 + }, + { + "episode": 39584, + "epoch": 0.7115073516195133, + "loss/policy_avg": -0.256904661655426, + "lr": 2.525881901840491e-06, + "objective/entropy": -5.667182922363281, + "objective/kl": 13.477766990661621, + "objective/non_score_reward": -1.3477768898010254, + "objective/rlhf_reward": -7.391107559204102, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.0692596435546875, + "policy/clipfrac_avg": 1.75, + "policy/entropy_avg": 0.8383212089538574, + "step": 2473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0026912689208984 + }, + { + "episode": 39600, + "epoch": 0.7117949455369019, + "loss/policy_avg": 0.03103642165660858, + "lr": 2.52569018404908e-06, + "objective/entropy": -117.89700317382812, + "objective/kl": 3.5854506492614746, + "objective/non_score_reward": -0.35854509472846985, + "objective/rlhf_reward": 0.299152954419454, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 1.3619961738586426, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5186189413070679, + "step": 2474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9998805522918701 + }, + { + "episode": 39616, + "epoch": 0.7120825394542906, + "loss/policy_avg": 0.02902698889374733, + "lr": 2.525498466257669e-06, + "objective/entropy": 196.0189666748047, + "objective/kl": 11.46353530883789, + "objective/non_score_reward": -1.1463537216186523, + "objective/rlhf_reward": -2.638003478722508, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 6.187473773956299, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8688172101974487, + "step": 2475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0030550956726074 + }, + { + "episode": 39632, + "epoch": 0.7123701333716792, + "loss/policy_avg": 0.3417108952999115, + "lr": 2.525306748466258e-06, + "objective/entropy": -89.01301574707031, + "objective/kl": 15.506939888000488, + "objective/non_score_reward": -1.5506939888000488, + "objective/rlhf_reward": -5.802776134014129, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.099979400634766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.61147540807724, + "step": 2476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 9, + "val/ratio": 1.996544361114502 + }, + { + "episode": 39648, + "epoch": 0.7126577272890678, + "loss/policy_avg": 1.3282965421676636, + "lr": 2.5251150306748467e-06, + "objective/entropy": -183.356689453125, + "objective/kl": 20.318004608154297, + "objective/non_score_reward": -2.0318005084991455, + "objective/rlhf_reward": -7.727202153205871, + "objective/scores": 0.1, + "policy/approxkl_avg": 23.901187896728516, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6787353754043579, + "step": 2477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.999013900756836 + }, + { + "episode": 39664, + "epoch": 0.7129453212064565, + "loss/policy_avg": -0.1604928970336914, + "lr": 2.524923312883436e-06, + "objective/entropy": -99.90774536132812, + "objective/kl": 10.385431289672852, + "objective/non_score_reward": -1.0385432243347168, + "objective/rlhf_reward": -6.154172897338867, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.267227172851562, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6196037530899048, + "step": 2478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.003584384918213 + }, + { + "episode": 39680, + "epoch": 0.7132329151238451, + "loss/policy_avg": 0.06962661445140839, + "lr": 2.5247315950920243e-06, + "objective/entropy": -21.934837341308594, + "objective/kl": 20.159160614013672, + "objective/non_score_reward": -2.015915870666504, + "objective/rlhf_reward": -3.6636640191078182, + "objective/scores": 1.1, + "policy/approxkl_avg": 38.49924850463867, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5127847194671631, + "step": 2479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.998596429824829 + }, + { + "episode": 39696, + "epoch": 0.7135205090412338, + "loss/policy_avg": 0.223766028881073, + "lr": 2.5245398773006135e-06, + "objective/entropy": 161.35714721679688, + "objective/kl": 16.0443115234375, + "objective/non_score_reward": -1.604431390762329, + "objective/rlhf_reward": -4.2950192711511, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.8752920627593994, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6941061019897461, + "step": 2480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995613098144531 + }, + { + "episode": 39712, + "epoch": 0.7138081029586224, + "loss/policy_avg": 0.16352665424346924, + "lr": 2.5243481595092023e-06, + "objective/entropy": -47.38682556152344, + "objective/kl": 22.613080978393555, + "objective/non_score_reward": -2.261308193206787, + "objective/rlhf_reward": -8.64523241519928, + "objective/scores": 0.1, + "policy/approxkl_avg": 40.13688659667969, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6305144429206848, + "step": 2481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9983210563659668 + }, + { + "episode": 39728, + "epoch": 0.714095696876011, + "loss/policy_avg": -0.052701499313116074, + "lr": 2.5241564417177916e-06, + "objective/entropy": -126.26377868652344, + "objective/kl": 15.999542236328125, + "objective/non_score_reward": -1.59995436668396, + "objective/rlhf_reward": -5.9998174667358395, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.10373306274414, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6740762591362, + "step": 2482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999603509902954 + }, + { + "episode": 39744, + "epoch": 0.7143832907933997, + "loss/policy_avg": 0.2014961838722229, + "lr": 2.5239647239263804e-06, + "objective/entropy": 119.9909439086914, + "objective/kl": 16.598087310791016, + "objective/non_score_reward": -1.6598087549209595, + "objective/rlhf_reward": -8.63923454284668, + "objective/scores": -0.5, + "policy/approxkl_avg": 87.78462219238281, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6382485628128052, + "step": 2483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000264644622803 + }, + { + "episode": 39760, + "epoch": 0.7146708847107883, + "loss/policy_avg": 0.000359182246029377, + "lr": 2.523773006134969e-06, + "objective/entropy": -233.36184692382812, + "objective/kl": 14.149672508239746, + "objective/non_score_reward": -1.4149672985076904, + "objective/rlhf_reward": -4.055749181572514, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 56.9659423828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4874529242515564, + "step": 2484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.000154733657837 + }, + { + "episode": 39776, + "epoch": 0.714958478628177, + "loss/policy_avg": 0.020351115614175797, + "lr": 2.5235812883435584e-06, + "objective/entropy": 63.841209411621094, + "objective/kl": 13.909393310546875, + "objective/non_score_reward": -1.3909393548965454, + "objective/rlhf_reward": -3.6163459522294357, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 17.216432571411133, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5677775144577026, + "step": 2485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9991185665130615 + }, + { + "episode": 39792, + "epoch": 0.7152460725455657, + "loss/policy_avg": 0.06903448700904846, + "lr": 2.5233895705521472e-06, + "objective/entropy": -20.867111206054688, + "objective/kl": 16.73269271850586, + "objective/non_score_reward": -1.673269271850586, + "objective/rlhf_reward": -8.693077087402344, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.240337371826172, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6185452938079834, + "step": 2486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9984816312789917 + }, + { + "episode": 39808, + "epoch": 0.7155336664629544, + "loss/policy_avg": 0.5605267286300659, + "lr": 2.523197852760736e-06, + "objective/entropy": 114.54639434814453, + "objective/kl": 15.872806549072266, + "objective/non_score_reward": -1.587280511856079, + "objective/rlhf_reward": -3.4254033907663555, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 45.90546417236328, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5481705665588379, + "step": 2487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987268447875977 + }, + { + "episode": 39824, + "epoch": 0.715821260380343, + "loss/policy_avg": 0.09952209889888763, + "lr": 2.5230061349693253e-06, + "objective/entropy": 108.37059783935547, + "objective/kl": 19.014339447021484, + "objective/non_score_reward": -1.9014341831207275, + "objective/rlhf_reward": -6.001616511408406, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 23.715465545654297, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5643784999847412, + "step": 2488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0007777214050293 + }, + { + "episode": 39840, + "epoch": 0.7161088542977316, + "loss/policy_avg": 0.012271733954548836, + "lr": 2.522814417177914e-06, + "objective/entropy": -56.91749954223633, + "objective/kl": 11.618297576904297, + "objective/non_score_reward": -1.1618297100067139, + "objective/rlhf_reward": -3.0431989765802197, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 6.499038219451904, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5179617404937744, + "step": 2489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9994232654571533 + }, + { + "episode": 39856, + "epoch": 0.7163964482151203, + "loss/policy_avg": 0.26141178607940674, + "lr": 2.5226226993865033e-06, + "objective/entropy": -87.80034637451172, + "objective/kl": 12.487592697143555, + "objective/non_score_reward": -1.2487592697143555, + "objective/rlhf_reward": -4.595037287473678, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.373741149902344, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.8327517509460449, + "step": 2490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0007476806640625 + }, + { + "episode": 39872, + "epoch": 0.7166840421325089, + "loss/policy_avg": 0.3865785002708435, + "lr": 2.522430981595092e-06, + "objective/entropy": 72.23921966552734, + "objective/kl": 18.066194534301758, + "objective/non_score_reward": -1.8066197633743286, + "objective/rlhf_reward": -9.226478576660156, + "objective/scores": -0.5, + "policy/approxkl_avg": 31.11423110961914, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.514935314655304, + "step": 2491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987236261367798 + }, + { + "episode": 39888, + "epoch": 0.7169716360498976, + "loss/policy_avg": 0.48632141947746277, + "lr": 2.522239263803681e-06, + "objective/entropy": 53.31781005859375, + "objective/kl": 21.280006408691406, + "objective/non_score_reward": -2.1280007362365723, + "objective/rlhf_reward": -10.512002944946289, + "objective/scores": -0.5, + "policy/approxkl_avg": 17.73607635498047, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.605411946773529, + "step": 2492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9973607063293457 + }, + { + "episode": 39904, + "epoch": 0.7172592299672862, + "loss/policy_avg": 0.5800670385360718, + "lr": 2.52204754601227e-06, + "objective/entropy": 54.6729621887207, + "objective/kl": 20.627490997314453, + "objective/non_score_reward": -2.0627493858337402, + "objective/rlhf_reward": -3.8509970664978024, + "objective/scores": 1.1, + "policy/approxkl_avg": 43.98124694824219, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40592074394226074, + "step": 2493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9984729290008545 + }, + { + "episode": 39920, + "epoch": 0.7175468238846748, + "loss/policy_avg": 0.03092500939965248, + "lr": 2.521855828220859e-06, + "objective/entropy": 122.41065216064453, + "objective/kl": 14.277397155761719, + "objective/non_score_reward": -1.4277398586273193, + "objective/rlhf_reward": -5.310959136486053, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.55243492126465, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6134049296379089, + "step": 2494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998281717300415 + }, + { + "episode": 39936, + "epoch": 0.7178344178020635, + "loss/policy_avg": 0.4120306372642517, + "lr": 2.521664110429448e-06, + "objective/entropy": 88.1956787109375, + "objective/kl": 12.321871757507324, + "objective/non_score_reward": -1.232187271118164, + "objective/rlhf_reward": -6.928748607635498, + "objective/scores": -0.5, + "policy/approxkl_avg": 29.983295440673828, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5779134035110474, + "step": 2495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9972310066223145 + }, + { + "episode": 39952, + "epoch": 0.7181220117194521, + "loss/policy_avg": 2.2432310581207275, + "lr": 2.521472392638037e-06, + "objective/entropy": 3.915924072265625, + "objective/kl": 16.655433654785156, + "objective/non_score_reward": -1.6655434370040894, + "objective/rlhf_reward": -4.262173390388488, + "objective/scores": 0.6, + "policy/approxkl_avg": 2.545776128768921, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7208318710327148, + "step": 2496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0046825408935547 + }, + { + "episode": 39968, + "epoch": 0.7184096056368408, + "loss/policy_avg": 0.45449748635292053, + "lr": 2.521280674846626e-06, + "objective/entropy": 37.128902435302734, + "objective/kl": 8.2744140625, + "objective/non_score_reward": -0.8274414539337158, + "objective/rlhf_reward": -1.3623545867966969, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 1.7655633687973022, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.43440258502960205, + "step": 2497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9999334812164307 + }, + { + "episode": 39984, + "epoch": 0.7186971995542294, + "loss/policy_avg": -0.03394457697868347, + "lr": 2.521088957055215e-06, + "objective/entropy": 192.6243438720703, + "objective/kl": 21.198884963989258, + "objective/non_score_reward": -2.1198887825012207, + "objective/rlhf_reward": -5.555835996509764, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 8.621723175048828, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5103436708450317, + "step": 2498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004539489746094 + }, + { + "episode": 40000, + "epoch": 0.718984793471618, + "loss/policy_avg": -0.17127051949501038, + "lr": 2.520897239263804e-06, + "objective/entropy": -76.00393676757812, + "objective/kl": 10.881692886352539, + "objective/non_score_reward": -1.0881690979003906, + "objective/rlhf_reward": -6.3526763916015625, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.505161285400391, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8575940728187561, + "step": 2499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.0071754455566406 + }, + { + "episode": 40016, + "epoch": 0.7192723873890067, + "loss/policy_avg": -0.1535816490650177, + "lr": 2.5207055214723927e-06, + "objective/entropy": 9.540390014648438, + "objective/kl": 21.719085693359375, + "objective/non_score_reward": -2.1719086170196533, + "objective/rlhf_reward": -4.287634468078613, + "objective/scores": 1.1, + "policy/approxkl_avg": 24.678245544433594, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7178096771240234, + "step": 2500, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.99528169631958 + }, + { + "episode": 40032, + "epoch": 0.7195599813063954, + "loss/policy_avg": 0.25635188817977905, + "lr": 2.5205138036809815e-06, + "objective/entropy": 221.4259490966797, + "objective/kl": 19.3717041015625, + "objective/non_score_reward": -1.9371705055236816, + "objective/rlhf_reward": -6.086822395742523, + "objective/scores": 0.41546487678572874, + "policy/approxkl_avg": 6.440564155578613, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5863523483276367, + "step": 2501, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994616508483887 + }, + { + "episode": 40048, + "epoch": 0.7198475752237841, + "loss/policy_avg": 0.39689481258392334, + "lr": 2.5203220858895703e-06, + "objective/entropy": -9.976652145385742, + "objective/kl": 20.1778564453125, + "objective/non_score_reward": -2.0177857875823975, + "objective/rlhf_reward": -3.6711431503295895, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.028419494628906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5400260090827942, + "step": 2502, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997795581817627 + }, + { + "episode": 40064, + "epoch": 0.7201351691411727, + "loss/policy_avg": 0.34789395332336426, + "lr": 2.5201303680981595e-06, + "objective/entropy": -121.99992370605469, + "objective/kl": 16.53069305419922, + "objective/non_score_reward": -1.6530694961547852, + "objective/rlhf_reward": -2.2122781112790104, + "objective/scores": 1.1, + "policy/approxkl_avg": 158.39163208007812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.660839319229126, + "step": 2503, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999260663986206 + }, + { + "episode": 40080, + "epoch": 0.7204227630585613, + "loss/policy_avg": 0.34718042612075806, + "lr": 2.5199386503067483e-06, + "objective/entropy": 138.98533630371094, + "objective/kl": 13.509183883666992, + "objective/non_score_reward": -1.3509184122085571, + "objective/rlhf_reward": -5.003673589229583, + "objective/scores": 0.1, + "policy/approxkl_avg": 29.142345428466797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6592766046524048, + "step": 2504, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989163875579834 + }, + { + "episode": 40096, + "epoch": 0.72071035697595, + "loss/policy_avg": 0.2920153737068176, + "lr": 2.5197469325153376e-06, + "objective/entropy": -91.42547607421875, + "objective/kl": 13.79290771484375, + "objective/non_score_reward": -1.3792908191680908, + "objective/rlhf_reward": -5.117163276672363, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.817221641540527, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7498093843460083, + "step": 2505, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0012240409851074 + }, + { + "episode": 40112, + "epoch": 0.7209979508933386, + "loss/policy_avg": 0.6532250642776489, + "lr": 2.5195552147239264e-06, + "objective/entropy": 85.36578369140625, + "objective/kl": 18.05005645751953, + "objective/non_score_reward": -1.8050055503845215, + "objective/rlhf_reward": -9.220022201538086, + "objective/scores": -0.5, + "policy/approxkl_avg": 27.8828125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5352431535720825, + "step": 2506, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9975520372390747 + }, + { + "episode": 40128, + "epoch": 0.7212855448107273, + "loss/policy_avg": 0.401783287525177, + "lr": 2.519363496932515e-06, + "objective/entropy": 89.23027801513672, + "objective/kl": 19.72549819946289, + "objective/non_score_reward": -1.9725496768951416, + "objective/rlhf_reward": -9.890198707580566, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.8628158569336, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8664801716804504, + "step": 2507, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9960271120071411 + }, + { + "episode": 40144, + "epoch": 0.7215731387281159, + "loss/policy_avg": 0.2731291949748993, + "lr": 2.5191717791411044e-06, + "objective/entropy": -43.07178497314453, + "objective/kl": 20.774587631225586, + "objective/non_score_reward": -2.0774588584899902, + "objective/rlhf_reward": -6.187128903643165, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 11.19920825958252, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5203945636749268, + "step": 2508, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985531568527222 + }, + { + "episode": 40160, + "epoch": 0.7218607326455045, + "loss/policy_avg": 0.28990763425827026, + "lr": 2.5189800613496932e-06, + "objective/entropy": 278.209228515625, + "objective/kl": 16.313892364501953, + "objective/non_score_reward": -1.6313892602920532, + "objective/rlhf_reward": -6.12555713057518, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.05839920043945, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6712355613708496, + "step": 2509, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993634223937988 + }, + { + "episode": 40176, + "epoch": 0.7221483265628932, + "loss/policy_avg": 0.03489197790622711, + "lr": 2.518788343558282e-06, + "objective/entropy": -10.516448974609375, + "objective/kl": 11.802026748657227, + "objective/non_score_reward": -1.1802027225494385, + "objective/rlhf_reward": -0.320810770988464, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.657503128051758, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6082391738891602, + "step": 2510, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 2.0005218982696533 + }, + { + "episode": 40192, + "epoch": 0.7224359204802818, + "loss/policy_avg": 0.2907443344593048, + "lr": 2.5185966257668713e-06, + "objective/entropy": -126.16361236572266, + "objective/kl": 15.766121864318848, + "objective/non_score_reward": -1.5766122341156006, + "objective/rlhf_reward": -1.9064491152763363, + "objective/scores": 1.1, + "policy/approxkl_avg": 4.333652973175049, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6034057140350342, + "step": 2511, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9992960691452026 + }, + { + "episode": 40208, + "epoch": 0.7227235143976705, + "loss/policy_avg": 0.2504030466079712, + "lr": 2.51840490797546e-06, + "objective/entropy": -84.10052490234375, + "objective/kl": 8.439793586730957, + "objective/non_score_reward": -0.8439792990684509, + "objective/rlhf_reward": -5.375917434692383, + "objective/scores": -0.5, + "policy/approxkl_avg": 50.38878631591797, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6781952381134033, + "step": 2512, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989845752716064 + }, + { + "episode": 40224, + "epoch": 0.7230111083150591, + "loss/policy_avg": 0.25285032391548157, + "lr": 2.5182131901840493e-06, + "objective/entropy": 234.56166076660156, + "objective/kl": 22.349105834960938, + "objective/non_score_reward": -2.234910488128662, + "objective/rlhf_reward": -6.015922997833464, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 87.60155487060547, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5998857021331787, + "step": 2513, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996349811553955 + }, + { + "episode": 40240, + "epoch": 0.7232987022324477, + "loss/policy_avg": 0.11215299367904663, + "lr": 2.518021472392638e-06, + "objective/entropy": -50.72164535522461, + "objective/kl": 14.362485885620117, + "objective/non_score_reward": -1.436248779296875, + "objective/rlhf_reward": -3.9201660930839286, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 104.22027587890625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6573004722595215, + "step": 2514, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0003345012664795 + }, + { + "episode": 40256, + "epoch": 0.7235862961498364, + "loss/policy_avg": 0.7702770233154297, + "lr": 2.517829754601227e-06, + "objective/entropy": 234.0601806640625, + "objective/kl": 11.553997039794922, + "objective/non_score_reward": -1.1553996801376343, + "objective/rlhf_reward": -6.621598720550537, + "objective/scores": -0.5, + "policy/approxkl_avg": 44.2823600769043, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7097456455230713, + "step": 2515, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001035690307617 + }, + { + "episode": 40272, + "epoch": 0.723873890067225, + "loss/policy_avg": 4.20575475692749, + "lr": 2.517638036809816e-06, + "objective/entropy": 261.3976135253906, + "objective/kl": 24.649492263793945, + "objective/non_score_reward": -2.464949131011963, + "objective/rlhf_reward": -9.459797000885011, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.047218322753906, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7010946273803711, + "step": 2516, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0022003650665283 + }, + { + "episode": 40288, + "epoch": 0.7241614839846138, + "loss/policy_avg": 0.537672221660614, + "lr": 2.517446319018405e-06, + "objective/entropy": 93.36152648925781, + "objective/kl": 14.049115180969238, + "objective/non_score_reward": -1.4049115180969238, + "objective/rlhf_reward": -2.695927296520445, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 72.57229614257812, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6079161167144775, + "step": 2517, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9962563514709473 + }, + { + "episode": 40304, + "epoch": 0.7244490779020024, + "loss/policy_avg": 0.6507375240325928, + "lr": 2.5172546012269942e-06, + "objective/entropy": -27.91612434387207, + "objective/kl": 10.094350814819336, + "objective/non_score_reward": -1.0094351768493652, + "objective/rlhf_reward": -1.9150341174760201, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 2.8608598709106445, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5371111035346985, + "step": 2518, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003819465637207 + }, + { + "episode": 40320, + "epoch": 0.724736671819391, + "loss/policy_avg": -0.16279977560043335, + "lr": 2.517062883435583e-06, + "objective/entropy": -46.6595344543457, + "objective/kl": 16.032554626464844, + "objective/non_score_reward": -1.6032553911209106, + "objective/rlhf_reward": -2.0130215644836422, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.540762901306152, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.6262022852897644, + "step": 2519, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0026564598083496 + }, + { + "episode": 40336, + "epoch": 0.7250242657367797, + "loss/policy_avg": 0.7180226445198059, + "lr": 2.516871165644172e-06, + "objective/entropy": 162.42645263671875, + "objective/kl": 13.07403564453125, + "objective/non_score_reward": -1.307403564453125, + "objective/rlhf_reward": -4.82961413860321, + "objective/scores": 0.1, + "policy/approxkl_avg": 62.3226203918457, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6494972705841064, + "step": 2520, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9977636337280273 + }, + { + "episode": 40352, + "epoch": 0.7253118596541683, + "loss/policy_avg": 0.4439278841018677, + "lr": 2.516679447852761e-06, + "objective/entropy": 122.22246551513672, + "objective/kl": 12.850114822387695, + "objective/non_score_reward": -1.2850115299224854, + "objective/rlhf_reward": -2.216327135206434, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 5.735032081604004, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7448204755783081, + "step": 2521, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9986133575439453 + }, + { + "episode": 40368, + "epoch": 0.725599453571557, + "loss/policy_avg": 0.09070031344890594, + "lr": 2.51648773006135e-06, + "objective/entropy": 58.672027587890625, + "objective/kl": 18.210243225097656, + "objective/non_score_reward": -1.8210242986679077, + "objective/rlhf_reward": -6.8840971946716305, + "objective/scores": 0.1, + "policy/approxkl_avg": 57.76567840576172, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.4609532356262207, + "step": 2522, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998521089553833 + }, + { + "episode": 40384, + "epoch": 0.7258870474889456, + "loss/policy_avg": 0.03193806856870651, + "lr": 2.5162960122699387e-06, + "objective/entropy": 177.22987365722656, + "objective/kl": 16.193603515625, + "objective/non_score_reward": -1.619360327720642, + "objective/rlhf_reward": -6.0774414300918576, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.085994243621826, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.5824298858642578, + "step": 2523, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001129150390625 + }, + { + "episode": 40400, + "epoch": 0.7261746414063343, + "loss/policy_avg": 0.5782080292701721, + "lr": 2.5161042944785275e-06, + "objective/entropy": 58.92469787597656, + "objective/kl": 16.6646728515625, + "objective/non_score_reward": -1.6664674282073975, + "objective/rlhf_reward": -6.265869921445846, + "objective/scores": 0.1, + "policy/approxkl_avg": 41.5260009765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6903586387634277, + "step": 2524, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9980307817459106 + }, + { + "episode": 40416, + "epoch": 0.7264622353237229, + "loss/policy_avg": 0.09085953235626221, + "lr": 2.5159125766871163e-06, + "objective/entropy": 176.7783203125, + "objective/kl": 15.784828186035156, + "objective/non_score_reward": -1.5784828662872314, + "objective/rlhf_reward": -5.913931345939636, + "objective/scores": 0.1, + "policy/approxkl_avg": 16.374055862426758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6402093172073364, + "step": 2525, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.99851655960083 + }, + { + "episode": 40432, + "epoch": 0.7267498292411115, + "loss/policy_avg": 0.10389601439237595, + "lr": 2.5157208588957055e-06, + "objective/entropy": -147.4564208984375, + "objective/kl": 16.93805694580078, + "objective/non_score_reward": -1.6938056945800781, + "objective/rlhf_reward": -6.375222897529602, + "objective/scores": 0.1, + "policy/approxkl_avg": 11.362157821655273, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7070935964584351, + "step": 2526, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.002185344696045 + }, + { + "episode": 40448, + "epoch": 0.7270374231585002, + "loss/policy_avg": 3.5322492122650146, + "lr": 2.5155291411042944e-06, + "objective/entropy": 70.91726684570312, + "objective/kl": 11.799054145812988, + "objective/non_score_reward": -1.1799055337905884, + "objective/rlhf_reward": -4.3196221388876435, + "objective/scores": 0.1, + "policy/approxkl_avg": 30.683063507080078, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.4909411668777466, + "step": 2527, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0027623176574707 + }, + { + "episode": 40464, + "epoch": 0.7273250170758888, + "loss/policy_avg": 0.985468864440918, + "lr": 2.5153374233128836e-06, + "objective/entropy": -76.50721740722656, + "objective/kl": 14.389606475830078, + "objective/non_score_reward": -1.4389605522155762, + "objective/rlhf_reward": -1.3558421492576596, + "objective/scores": 1.1, + "policy/approxkl_avg": 14.244632720947266, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.662421464920044, + "step": 2528, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9981179237365723 + }, + { + "episode": 40480, + "epoch": 0.7276126109932775, + "loss/policy_avg": 0.32100650668144226, + "lr": 2.5151457055214724e-06, + "objective/entropy": 136.02963256835938, + "objective/kl": 15.310518264770508, + "objective/non_score_reward": -1.5310518741607666, + "objective/rlhf_reward": -5.724207437038421, + "objective/scores": 0.1, + "policy/approxkl_avg": 47.59115219116211, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7093237638473511, + "step": 2529, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.0003662109375 + }, + { + "episode": 40496, + "epoch": 0.7279002049106661, + "loss/policy_avg": 0.3680607080459595, + "lr": 2.514953987730061e-06, + "objective/entropy": -45.524776458740234, + "objective/kl": 17.739091873168945, + "objective/non_score_reward": -1.7739092111587524, + "objective/rlhf_reward": -6.695636904239654, + "objective/scores": 0.1, + "policy/approxkl_avg": 13.970775604248047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49509182572364807, + "step": 2530, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9971013069152832 + }, + { + "episode": 40512, + "epoch": 0.7281877988280547, + "loss/policy_avg": 0.40724849700927734, + "lr": 2.5147622699386504e-06, + "objective/entropy": 249.31178283691406, + "objective/kl": 21.762939453125, + "objective/non_score_reward": -2.1762938499450684, + "objective/rlhf_reward": -8.30517611503601, + "objective/scores": 0.1, + "policy/approxkl_avg": 39.761436462402344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8204064965248108, + "step": 2531, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9969172477722168 + }, + { + "episode": 40528, + "epoch": 0.7284753927454435, + "loss/policy_avg": 0.72802734375, + "lr": 2.5145705521472392e-06, + "objective/entropy": 81.80903625488281, + "objective/kl": 12.155574798583984, + "objective/non_score_reward": -1.21555757522583, + "objective/rlhf_reward": -6.86223030090332, + "objective/scores": -0.5, + "policy/approxkl_avg": 11.317900657653809, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7858550548553467, + "step": 2532, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.999354362487793 + }, + { + "episode": 40544, + "epoch": 0.7287629866628321, + "loss/policy_avg": 0.17164215445518494, + "lr": 2.5143788343558285e-06, + "objective/entropy": 77.85188293457031, + "objective/kl": 12.587995529174805, + "objective/non_score_reward": -1.25879967212677, + "objective/rlhf_reward": -2.1114797338258953, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 36.097679138183594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5762654542922974, + "step": 2533, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973692893981934 + }, + { + "episode": 40560, + "epoch": 0.7290505805802208, + "loss/policy_avg": 0.3810919523239136, + "lr": 2.5141871165644173e-06, + "objective/entropy": 80.29145050048828, + "objective/kl": 13.833352088928223, + "objective/non_score_reward": -1.3833352327346802, + "objective/rlhf_reward": -7.533340930938721, + "objective/scores": -0.5, + "policy/approxkl_avg": 19.537090301513672, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4961831569671631, + "step": 2534, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968695640563965 + }, + { + "episode": 40576, + "epoch": 0.7293381744976094, + "loss/policy_avg": 0.0897863432765007, + "lr": 2.513995398773006e-06, + "objective/entropy": 77.95818328857422, + "objective/kl": 17.291271209716797, + "objective/non_score_reward": -1.7291271686553955, + "objective/rlhf_reward": -5.312388826076107, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 46.346221923828125, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6265320181846619, + "step": 2535, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9968516826629639 + }, + { + "episode": 40592, + "epoch": 0.729625768414998, + "loss/policy_avg": -0.05747275426983833, + "lr": 2.5138036809815953e-06, + "objective/entropy": 202.11474609375, + "objective/kl": 17.736324310302734, + "objective/non_score_reward": -1.773632526397705, + "objective/rlhf_reward": -2.6945299863815304, + "objective/scores": 1.1, + "policy/approxkl_avg": 31.00056266784668, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.9188692569732666, + "step": 2536, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9991137981414795 + }, + { + "episode": 40608, + "epoch": 0.7299133623323867, + "loss/policy_avg": 0.08236008137464523, + "lr": 2.513611963190184e-06, + "objective/entropy": -31.01387596130371, + "objective/kl": 20.521949768066406, + "objective/non_score_reward": -2.052194595336914, + "objective/rlhf_reward": -10.208778381347656, + "objective/scores": -0.5, + "policy/approxkl_avg": 9.510942459106445, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51418536901474, + "step": 2537, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998669147491455 + }, + { + "episode": 40624, + "epoch": 0.7302009562497753, + "loss/policy_avg": 0.2037382423877716, + "lr": 2.513420245398773e-06, + "objective/entropy": -219.51242065429688, + "objective/kl": 11.540672302246094, + "objective/non_score_reward": -1.1540671586990356, + "objective/rlhf_reward": -0.216268694400787, + "objective/scores": 1.1, + "policy/approxkl_avg": 6.083378314971924, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5459791421890259, + "step": 2538, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.0007717609405518 + }, + { + "episode": 40640, + "epoch": 0.730488550167164, + "loss/policy_avg": 0.09275393187999725, + "lr": 2.513228527607362e-06, + "objective/entropy": -0.581573486328125, + "objective/kl": 12.143417358398438, + "objective/non_score_reward": -1.2143417596817017, + "objective/rlhf_reward": -4.457367038726806, + "objective/scores": 0.1, + "policy/approxkl_avg": 31.044187545776367, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7604504823684692, + "step": 2539, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9973387718200684 + }, + { + "episode": 40656, + "epoch": 0.7307761440845526, + "loss/policy_avg": -0.06498684734106064, + "lr": 2.513036809815951e-06, + "objective/entropy": 31.954769134521484, + "objective/kl": 12.22402572631836, + "objective/non_score_reward": -1.2224026918411255, + "objective/rlhf_reward": -4.489610707759857, + "objective/scores": 0.1, + "policy/approxkl_avg": 4.74851131439209, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3908454179763794, + "step": 2540, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000835418701172 + }, + { + "episode": 40672, + "epoch": 0.7310637380019412, + "loss/policy_avg": 0.24499952793121338, + "lr": 2.5128450920245402e-06, + "objective/entropy": 206.56790161132812, + "objective/kl": 11.419059753417969, + "objective/non_score_reward": -1.1419059038162231, + "objective/rlhf_reward": -4.1676237940788265, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.2242417335510254, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5385485887527466, + "step": 2541, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006425380706787 + }, + { + "episode": 40688, + "epoch": 0.7313513319193299, + "loss/policy_avg": 0.5701605677604675, + "lr": 2.512653374233129e-06, + "objective/entropy": -62.02988052368164, + "objective/kl": 16.88791275024414, + "objective/non_score_reward": -1.6887911558151245, + "objective/rlhf_reward": -2.355164802074432, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.32274341583252, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6799558401107788, + "step": 2542, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9978663921356201 + }, + { + "episode": 40704, + "epoch": 0.7316389258367185, + "loss/policy_avg": 1.1101778745651245, + "lr": 2.512461656441718e-06, + "objective/entropy": 139.62664794921875, + "objective/kl": 14.743820190429688, + "objective/non_score_reward": -1.4743821620941162, + "objective/rlhf_reward": -1.4975284099578854, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.301300048828125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7435659170150757, + "step": 2543, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000453233718872 + }, + { + "episode": 40720, + "epoch": 0.7319265197541072, + "loss/policy_avg": 0.050421155989170074, + "lr": 2.512269938650307e-06, + "objective/entropy": 117.1237564086914, + "objective/kl": 21.02765464782715, + "objective/non_score_reward": -2.1027655601501465, + "objective/rlhf_reward": -8.011061763763427, + "objective/scores": 0.1, + "policy/approxkl_avg": 3.9429683685302734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.565706729888916, + "step": 2544, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006346702575684 + }, + { + "episode": 40736, + "epoch": 0.7322141136714958, + "loss/policy_avg": 0.22848790884017944, + "lr": 2.512078220858896e-06, + "objective/entropy": -54.253082275390625, + "objective/kl": 14.824952125549316, + "objective/non_score_reward": -1.4824953079223633, + "objective/rlhf_reward": -7.929981231689453, + "objective/scores": -0.5, + "policy/approxkl_avg": 35.86559295654297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4033016264438629, + "step": 2545, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9994404315948486 + }, + { + "episode": 40752, + "epoch": 0.7325017075888844, + "loss/policy_avg": 0.031839221715927124, + "lr": 2.5118865030674847e-06, + "objective/entropy": 117.71728515625, + "objective/kl": 17.298198699951172, + "objective/non_score_reward": -1.729819893836975, + "objective/rlhf_reward": -4.971868406014378, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 27.860347747802734, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.45791611075401306, + "step": 2546, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0025744438171387 + }, + { + "episode": 40768, + "epoch": 0.7327893015062732, + "loss/policy_avg": 0.13714680075645447, + "lr": 2.5116947852760735e-06, + "objective/entropy": -94.83277130126953, + "objective/kl": 12.307830810546875, + "objective/non_score_reward": -1.2307831048965454, + "objective/rlhf_reward": -1.9994134649049966, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.999788761138916, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5606850385665894, + "step": 2547, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9989594221115112 + }, + { + "episode": 40784, + "epoch": 0.7330768954236618, + "loss/policy_avg": 0.16052904725074768, + "lr": 2.5115030674846623e-06, + "objective/entropy": 106.58887481689453, + "objective/kl": 17.21367835998535, + "objective/non_score_reward": -1.7213678359985352, + "objective/rlhf_reward": -6.48547134399414, + "objective/scores": 0.1, + "policy/approxkl_avg": 34.78619384765625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.710265576839447, + "step": 2548, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9989547729492188 + }, + { + "episode": 40800, + "epoch": 0.7333644893410505, + "loss/policy_avg": 0.3068848252296448, + "lr": 2.5113113496932516e-06, + "objective/entropy": 184.2476348876953, + "objective/kl": 16.913951873779297, + "objective/non_score_reward": -1.6913950443267822, + "objective/rlhf_reward": -4.642874183432136, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 63.81663513183594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.638231635093689, + "step": 2549, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9975179433822632 + }, + { + "episode": 40816, + "epoch": 0.7336520832584391, + "loss/policy_avg": 0.26979178190231323, + "lr": 2.5111196319018404e-06, + "objective/entropy": 90.11837768554688, + "objective/kl": 11.649972915649414, + "objective/non_score_reward": -1.1649973392486572, + "objective/rlhf_reward": -0.2599894165992733, + "objective/scores": 1.1, + "policy/approxkl_avg": 44.92051696777344, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7661033272743225, + "step": 2550, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9970160722732544 + }, + { + "episode": 40832, + "epoch": 0.7339396771758278, + "loss/policy_avg": 0.1557759940624237, + "lr": 2.5109279141104296e-06, + "objective/entropy": 61.577484130859375, + "objective/kl": 14.083431243896484, + "objective/non_score_reward": -1.4083431959152222, + "objective/rlhf_reward": -4.077113448587015, + "objective/scores": 0.38906482631788786, + "policy/approxkl_avg": 18.093772888183594, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5893636345863342, + "step": 2551, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003867149353027 + }, + { + "episode": 40848, + "epoch": 0.7342272710932164, + "loss/policy_avg": 0.5789381265640259, + "lr": 2.5107361963190184e-06, + "objective/entropy": 151.58245849609375, + "objective/kl": 13.456867218017578, + "objective/non_score_reward": -1.345686674118042, + "objective/rlhf_reward": -3.26004076220182, + "objective/scores": 0.5306765580733931, + "policy/approxkl_avg": 7.168601989746094, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5869942903518677, + "step": 2552, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000091791152954 + }, + { + "episode": 40864, + "epoch": 0.734514865010605, + "loss/policy_avg": 0.27852749824523926, + "lr": 2.5105444785276072e-06, + "objective/entropy": -60.68942642211914, + "objective/kl": 12.308164596557617, + "objective/non_score_reward": -1.2308166027069092, + "objective/rlhf_reward": -6.923266410827637, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.5334177017211914, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7421091794967651, + "step": 2553, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9989408254623413 + }, + { + "episode": 40880, + "epoch": 0.7348024589279937, + "loss/policy_avg": 0.04276307299733162, + "lr": 2.5103527607361964e-06, + "objective/entropy": 165.76051330566406, + "objective/kl": 18.468660354614258, + "objective/non_score_reward": -1.8468658924102783, + "objective/rlhf_reward": -6.987463629245758, + "objective/scores": 0.1, + "policy/approxkl_avg": 25.74111557006836, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4159314036369324, + "step": 2554, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000864028930664 + }, + { + "episode": 40896, + "epoch": 0.7350900528453823, + "loss/policy_avg": 0.31651490926742554, + "lr": 2.5101610429447853e-06, + "objective/entropy": 52.57521057128906, + "objective/kl": 20.297330856323242, + "objective/non_score_reward": -2.029733180999756, + "objective/rlhf_reward": -6.171521733479436, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 51.515777587890625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6293849945068359, + "step": 2555, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9976792335510254 + }, + { + "episode": 40912, + "epoch": 0.735377646762771, + "loss/policy_avg": 0.45035499334335327, + "lr": 2.5099693251533745e-06, + "objective/entropy": -4.227651596069336, + "objective/kl": 15.972675323486328, + "objective/non_score_reward": -1.5972673892974854, + "objective/rlhf_reward": -5.989069706201553, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.464114189147949, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5598526000976562, + "step": 2556, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9997695684432983 + }, + { + "episode": 40928, + "epoch": 0.7356652406801596, + "loss/policy_avg": 0.21771469712257385, + "lr": 2.5097776073619633e-06, + "objective/entropy": 220.2686767578125, + "objective/kl": 11.255245208740234, + "objective/non_score_reward": -1.1255245208740234, + "objective/rlhf_reward": -6.502098083496094, + "objective/scores": -0.5, + "policy/approxkl_avg": 5.5175018310546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5174956321716309, + "step": 2557, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979242086410522 + }, + { + "episode": 40944, + "epoch": 0.7359528345975482, + "loss/policy_avg": 0.8126636743545532, + "lr": 2.509585889570552e-06, + "objective/entropy": -18.99462127685547, + "objective/kl": 15.622753143310547, + "objective/non_score_reward": -1.5622755289077759, + "objective/rlhf_reward": -1.849102249741554, + "objective/scores": 1.1, + "policy/approxkl_avg": 10.049933433532715, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5314970016479492, + "step": 2558, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9989418983459473 + }, + { + "episode": 40960, + "epoch": 0.7362404285149369, + "loss/policy_avg": 0.1002318486571312, + "lr": 2.5093941717791413e-06, + "objective/entropy": -117.28474426269531, + "objective/kl": 11.140116691589355, + "objective/non_score_reward": -1.1140117645263672, + "objective/rlhf_reward": -6.456047058105469, + "objective/scores": -0.5, + "policy/approxkl_avg": 4.838708877563477, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7269535064697266, + "step": 2559, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.9991798400878906 + }, + { + "episode": 40976, + "epoch": 0.7365280224323255, + "loss/policy_avg": 0.0044472068548202515, + "lr": 2.50920245398773e-06, + "objective/entropy": 99.82304382324219, + "objective/kl": 13.302616119384766, + "objective/non_score_reward": -1.3302617073059082, + "objective/rlhf_reward": -0.9210467696189877, + "objective/scores": 1.1, + "policy/approxkl_avg": 21.16503143310547, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5178516507148743, + "step": 2560, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003678321838379 + }, + { + "episode": 40992, + "epoch": 0.7368156163497142, + "loss/policy_avg": 0.017942823469638824, + "lr": 2.509010736196319e-06, + "objective/entropy": -7.6581573486328125, + "objective/kl": 16.165346145629883, + "objective/non_score_reward": -1.6165344715118408, + "objective/rlhf_reward": -8.466137886047363, + "objective/scores": -0.5, + "policy/approxkl_avg": 41.765777587890625, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7504334449768066, + "step": 2561, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9999984502792358 + }, + { + "episode": 41008, + "epoch": 0.7371032102671029, + "loss/policy_avg": 0.3086368143558502, + "lr": 2.508819018404908e-06, + "objective/entropy": -166.00128173828125, + "objective/kl": 15.402877807617188, + "objective/non_score_reward": -1.540287733078003, + "objective/rlhf_reward": -3.237431828619215, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 15.016733169555664, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.443015456199646, + "step": 2562, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 2.0006279945373535 + }, + { + "episode": 41024, + "epoch": 0.7373908041844915, + "loss/policy_avg": 0.22254131734371185, + "lr": 2.508627300613497e-06, + "objective/entropy": 139.58291625976562, + "objective/kl": 13.144253730773926, + "objective/non_score_reward": -1.3144253492355347, + "objective/rlhf_reward": -0.8577013969421383, + "objective/scores": 1.1, + "policy/approxkl_avg": 11.086759567260742, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3470708131790161, + "step": 2563, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007386207580566 + }, + { + "episode": 41040, + "epoch": 0.7376783981018802, + "loss/policy_avg": 0.41101884841918945, + "lr": 2.5084355828220862e-06, + "objective/entropy": -132.093505859375, + "objective/kl": 11.1688871383667, + "objective/non_score_reward": -1.1168886423110962, + "objective/rlhf_reward": -4.067554539442062, + "objective/scores": 0.1, + "policy/approxkl_avg": 28.847564697265625, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5715075135231018, + "step": 2564, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000303268432617 + }, + { + "episode": 41056, + "epoch": 0.7379659920192688, + "loss/policy_avg": 0.049172185361385345, + "lr": 2.508243865030675e-06, + "objective/entropy": 130.5298614501953, + "objective/kl": 16.698543548583984, + "objective/non_score_reward": -1.6698545217514038, + "objective/rlhf_reward": -6.279418087005615, + "objective/scores": 0.1, + "policy/approxkl_avg": 7.322670936584473, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7057923078536987, + "step": 2565, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988740682601929 + }, + { + "episode": 41072, + "epoch": 0.7382535859366575, + "loss/policy_avg": 0.3305116891860962, + "lr": 2.508052147239264e-06, + "objective/entropy": -9.86077880859375, + "objective/kl": 7.6794633865356445, + "objective/non_score_reward": -0.7679464817047119, + "objective/rlhf_reward": -1.3384524444739023, + "objective/scores": 0.43333333333333335, + "policy/approxkl_avg": 4.693934917449951, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.559454083442688, + "step": 2566, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.000784158706665 + }, + { + "episode": 41088, + "epoch": 0.7385411798540461, + "loss/policy_avg": 0.23565122485160828, + "lr": 2.507860429447853e-06, + "objective/entropy": 116.38072967529297, + "objective/kl": 10.964803695678711, + "objective/non_score_reward": -1.096480369567871, + "objective/rlhf_reward": -3.9859213441610333, + "objective/scores": 0.1, + "policy/approxkl_avg": 1.9331263303756714, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49167805910110474, + "step": 2567, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000530242919922 + }, + { + "episode": 41104, + "epoch": 0.7388287737714347, + "loss/policy_avg": -0.2451452612876892, + "lr": 2.5076687116564415e-06, + "objective/entropy": 224.76869201660156, + "objective/kl": 15.328838348388672, + "objective/non_score_reward": -1.532883882522583, + "objective/rlhf_reward": -1.7315352618694302, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.784040451049805, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5386008024215698, + "step": 2568, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0049238204956055 + }, + { + "episode": 41120, + "epoch": 0.7391163676888234, + "loss/policy_avg": 0.5509124994277954, + "lr": 2.5074769938650307e-06, + "objective/entropy": 163.45675659179688, + "objective/kl": 11.174448013305664, + "objective/non_score_reward": -1.1174447536468506, + "objective/rlhf_reward": -4.0697791337966915, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.825078010559082, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.4163983464241028, + "step": 2569, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999847173690796 + }, + { + "episode": 41136, + "epoch": 0.739403961606212, + "loss/policy_avg": 0.39133358001708984, + "lr": 2.5072852760736195e-06, + "objective/entropy": 57.593509674072266, + "objective/kl": 19.556625366210938, + "objective/non_score_reward": -1.955662488937378, + "objective/rlhf_reward": -9.822649955749512, + "objective/scores": -0.5, + "policy/approxkl_avg": 15.338714599609375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5403202772140503, + "step": 2570, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9974697828292847 + }, + { + "episode": 41152, + "epoch": 0.7396915555236007, + "loss/policy_avg": 0.2464519590139389, + "lr": 2.5070935582822087e-06, + "objective/entropy": -43.940223693847656, + "objective/kl": 12.578645706176758, + "objective/non_score_reward": -1.2578647136688232, + "objective/rlhf_reward": -3.427338603798466, + "objective/scores": 0.40102999566398123, + "policy/approxkl_avg": 9.57274341583252, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7311115264892578, + "step": 2571, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999946117401123 + }, + { + "episode": 41168, + "epoch": 0.7399791494409893, + "loss/policy_avg": 0.5888267755508423, + "lr": 2.5069018404907976e-06, + "objective/entropy": 31.491531372070312, + "objective/kl": 13.00638198852539, + "objective/non_score_reward": -1.3006383180618286, + "objective/rlhf_reward": -4.802553451061248, + "objective/scores": 0.1, + "policy/approxkl_avg": 8.621150016784668, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6433630585670471, + "step": 2572, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998997688293457 + }, + { + "episode": 41184, + "epoch": 0.7402667433583779, + "loss/policy_avg": 0.030804350972175598, + "lr": 2.5067101226993864e-06, + "objective/entropy": 54.20383071899414, + "objective/kl": 15.847734451293945, + "objective/non_score_reward": -1.5847735404968262, + "objective/rlhf_reward": -3.939094221591949, + "objective/scores": 0.6, + "policy/approxkl_avg": 3.758124351501465, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3955601453781128, + "step": 2573, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991270303726196 + }, + { + "episode": 41200, + "epoch": 0.7405543372757666, + "loss/policy_avg": -0.06783771514892578, + "lr": 2.5065184049079756e-06, + "objective/entropy": 102.63035583496094, + "objective/kl": 18.602985382080078, + "objective/non_score_reward": -1.8602986335754395, + "objective/rlhf_reward": -7.041194236278534, + "objective/scores": 0.1, + "policy/approxkl_avg": 8.853973388671875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.7215365767478943, + "step": 2574, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9993562698364258 + }, + { + "episode": 41216, + "epoch": 0.7408419311931552, + "loss/policy_avg": 0.24368295073509216, + "lr": 2.5063266871165644e-06, + "objective/entropy": 21.702545166015625, + "objective/kl": 14.640352249145508, + "objective/non_score_reward": -1.4640350341796875, + "objective/rlhf_reward": -2.9324214130055637, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 7.804903984069824, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.8155585527420044, + "step": 2575, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 2.00089168548584 + }, + { + "episode": 41232, + "epoch": 0.7411295251105439, + "loss/policy_avg": 0.3724660575389862, + "lr": 2.5061349693251532e-06, + "objective/entropy": -147.16192626953125, + "objective/kl": 20.30790901184082, + "objective/non_score_reward": -2.0307908058166504, + "objective/rlhf_reward": -7.723163878917694, + "objective/scores": 0.1, + "policy/approxkl_avg": 21.299043655395508, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5898343920707703, + "step": 2576, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9970901012420654 + }, + { + "episode": 41248, + "epoch": 0.7414171190279326, + "loss/policy_avg": 0.19356757402420044, + "lr": 2.5059432515337425e-06, + "objective/entropy": -0.8742942810058594, + "objective/kl": 12.401459693908691, + "objective/non_score_reward": -1.2401460409164429, + "objective/rlhf_reward": -4.560584163665771, + "objective/scores": 0.1, + "policy/approxkl_avg": 9.509387016296387, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5665322542190552, + "step": 2577, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 6, + "val/ratio": 1.9999457597732544 + }, + { + "episode": 41264, + "epoch": 0.7417047129453213, + "loss/policy_avg": -0.13321802020072937, + "lr": 2.5057515337423313e-06, + "objective/entropy": 62.121734619140625, + "objective/kl": 16.56624984741211, + "objective/non_score_reward": -1.6566250324249268, + "objective/rlhf_reward": -2.2265001296997067, + "objective/scores": 1.1, + "policy/approxkl_avg": 1.7434561252593994, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7118167877197266, + "step": 2578, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00130558013916 + }, + { + "episode": 41280, + "epoch": 0.7419923068627099, + "loss/policy_avg": 0.7044010758399963, + "lr": 2.5055598159509205e-06, + "objective/entropy": -73.61532592773438, + "objective/kl": 11.75814437866211, + "objective/non_score_reward": -1.1758145093917847, + "objective/rlhf_reward": -0.3032580226659771, + "objective/scores": 1.1, + "policy/approxkl_avg": 0.6257975101470947, + "policy/clipfrac_avg": 0.25, + "policy/entropy_avg": 0.5945839285850525, + "step": 2579, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.001973867416382 + }, + { + "episode": 41296, + "epoch": 0.7422799007800985, + "loss/policy_avg": 1.1437344551086426, + "lr": 2.5053680981595093e-06, + "objective/entropy": 47.38404846191406, + "objective/kl": 16.088647842407227, + "objective/non_score_reward": -1.6088650226593018, + "objective/rlhf_reward": -6.035459733009338, + "objective/scores": 0.1, + "policy/approxkl_avg": 25.359088897705078, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6521689891815186, + "step": 2580, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.997725248336792 + }, + { + "episode": 41312, + "epoch": 0.7425674946974872, + "loss/policy_avg": 0.6333919763565063, + "lr": 2.505176380368098e-06, + "objective/entropy": 110.00313568115234, + "objective/kl": 17.45827865600586, + "objective/non_score_reward": -1.7458280324935913, + "objective/rlhf_reward": -2.5833120107650753, + "objective/scores": 1.1, + "policy/approxkl_avg": 49.832862854003906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6787171363830566, + "step": 2581, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9961440563201904 + }, + { + "episode": 41328, + "epoch": 0.7428550886148758, + "loss/policy_avg": 0.7040215134620667, + "lr": 2.5049846625766873e-06, + "objective/entropy": -68.08572387695312, + "objective/kl": 14.66339111328125, + "objective/non_score_reward": -1.466339111328125, + "objective/rlhf_reward": -5.465356385707855, + "objective/scores": 0.1, + "policy/approxkl_avg": 35.840213775634766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5003482103347778, + "step": 2582, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 2.000117778778076 + }, + { + "episode": 41344, + "epoch": 0.7431426825322645, + "loss/policy_avg": 0.9921929240226746, + "lr": 2.504792944785276e-06, + "objective/entropy": -46.91489791870117, + "objective/kl": 16.757122039794922, + "objective/non_score_reward": -1.6757123470306396, + "objective/rlhf_reward": -4.302849507331848, + "objective/scores": 0.6, + "policy/approxkl_avg": 13.285392761230469, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5311835408210754, + "step": 2583, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 4, + "val/ratio": 1.9990557432174683 + }, + { + "episode": 41360, + "epoch": 0.7434302764496531, + "loss/policy_avg": 0.36089640855789185, + "lr": 2.5046012269938654e-06, + "objective/entropy": 174.65965270996094, + "objective/kl": 9.799924850463867, + "objective/non_score_reward": -0.9799926280975342, + "objective/rlhf_reward": -3.5199706166982647, + "objective/scores": 0.1, + "policy/approxkl_avg": 24.74539566040039, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.7819864749908447, + "step": 2584, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997823238372803 + }, + { + "episode": 41376, + "epoch": 0.7437178703670417, + "loss/policy_avg": 0.7528730630874634, + "lr": 2.504409509202454e-06, + "objective/entropy": 7.559715270996094, + "objective/kl": 9.195741653442383, + "objective/non_score_reward": -0.9195740222930908, + "objective/rlhf_reward": 0.7217038810253147, + "objective/scores": 1.1, + "policy/approxkl_avg": 16.911163330078125, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5018427968025208, + "step": 2585, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978995323181152 + }, + { + "episode": 41392, + "epoch": 0.7440054642844304, + "loss/policy_avg": 0.43830037117004395, + "lr": 2.504217791411043e-06, + "objective/entropy": 120.500732421875, + "objective/kl": 16.796005249023438, + "objective/non_score_reward": -1.679600477218628, + "objective/rlhf_reward": -8.718401908874512, + "objective/scores": -0.5, + "policy/approxkl_avg": 33.07453155517578, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.5435898303985596, + "step": 2586, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9985113143920898 + }, + { + "episode": 41408, + "epoch": 0.744293058201819, + "loss/policy_avg": -0.25337135791778564, + "lr": 2.5040260736196322e-06, + "objective/entropy": 19.597820281982422, + "objective/kl": 8.86826229095459, + "objective/non_score_reward": -0.8868262767791748, + "objective/rlhf_reward": -0.6235860034239022, + "objective/scores": 0.7309297535714575, + "policy/approxkl_avg": 9.556246757507324, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.3631720542907715, + "step": 2587, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0048668384552 + }, + { + "episode": 41424, + "epoch": 0.7445806521192077, + "loss/policy_avg": 0.009761861525475979, + "lr": 2.503834355828221e-06, + "objective/entropy": 164.787353515625, + "objective/kl": 17.39581298828125, + "objective/non_score_reward": -1.7395814657211304, + "objective/rlhf_reward": -6.558325624465942, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.974054336547852, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.787891149520874, + "step": 2588, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001471519470215 + }, + { + "episode": 41440, + "epoch": 0.7448682460365963, + "loss/policy_avg": 0.15249930322170258, + "lr": 2.50364263803681e-06, + "objective/entropy": 4.250644683837891, + "objective/kl": 13.433493614196777, + "objective/non_score_reward": -1.3433494567871094, + "objective/rlhf_reward": -2.973397678136825, + "objective/scores": 0.6, + "policy/approxkl_avg": 10.847415924072266, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6573472023010254, + "step": 2589, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.998237133026123 + }, + { + "episode": 41456, + "epoch": 0.7451558399539849, + "loss/policy_avg": 1.9638333320617676, + "lr": 2.5034509202453987e-06, + "objective/entropy": 97.89153289794922, + "objective/kl": 10.478702545166016, + "objective/non_score_reward": -1.04787015914917, + "objective/rlhf_reward": -6.191481113433838, + "objective/scores": -0.5, + "policy/approxkl_avg": 3.616457462310791, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5932775735855103, + "step": 2590, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001009464263916 + }, + { + "episode": 41472, + "epoch": 0.7454434338713736, + "loss/policy_avg": 0.4018653929233551, + "lr": 2.5032592024539875e-06, + "objective/entropy": -210.961181640625, + "objective/kl": 6.5700788497924805, + "objective/non_score_reward": -0.6570079326629639, + "objective/rlhf_reward": -2.2280315816402436, + "objective/scores": 0.1, + "policy/approxkl_avg": 2.616713047027588, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6298050880432129, + "step": 2591, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0021932125091553 + }, + { + "episode": 41488, + "epoch": 0.7457310277887622, + "loss/policy_avg": 0.4341945946216583, + "lr": 2.5030674846625767e-06, + "objective/entropy": 31.865272521972656, + "objective/kl": 12.279970169067383, + "objective/non_score_reward": -1.227997064590454, + "objective/rlhf_reward": -6.911988258361816, + "objective/scores": -0.5, + "policy/approxkl_avg": 22.766761779785156, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.2969363331794739, + "step": 2592, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997770071029663 + }, + { + "episode": 41504, + "epoch": 0.746018621706151, + "loss/policy_avg": -0.019156618043780327, + "lr": 2.5028757668711655e-06, + "objective/entropy": -139.5720977783203, + "objective/kl": 15.742167472839355, + "objective/non_score_reward": -1.5742168426513672, + "objective/rlhf_reward": -8.296867370605469, + "objective/scores": -0.5, + "policy/approxkl_avg": 81.86674499511719, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.8666651248931885, + "step": 2593, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.9980387687683105 + }, + { + "episode": 41520, + "epoch": 0.7463062156235396, + "loss/policy_avg": 0.06185273453593254, + "lr": 2.5026840490797548e-06, + "objective/entropy": 189.14340209960938, + "objective/kl": 16.34478759765625, + "objective/non_score_reward": -1.6344788074493408, + "objective/rlhf_reward": -8.537915229797363, + "objective/scores": -0.5, + "policy/approxkl_avg": 70.36660766601562, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.7835408449172974, + "step": 2594, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999415874481201 + }, + { + "episode": 41536, + "epoch": 0.7465938095409282, + "loss/policy_avg": 0.22691065073013306, + "lr": 2.5024923312883436e-06, + "objective/entropy": 39.11201477050781, + "objective/kl": 19.617210388183594, + "objective/non_score_reward": -1.9617210626602173, + "objective/rlhf_reward": -3.446884578466415, + "objective/scores": 1.1, + "policy/approxkl_avg": 23.82514190673828, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6113545894622803, + "step": 2595, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 2.00091552734375 + }, + { + "episode": 41552, + "epoch": 0.7468814034583169, + "loss/policy_avg": 0.05605784058570862, + "lr": 2.5023006134969324e-06, + "objective/entropy": -56.80852508544922, + "objective/kl": 11.131508827209473, + "objective/non_score_reward": -1.113150954246521, + "objective/rlhf_reward": -0.05260381698608363, + "objective/scores": 1.1, + "policy/approxkl_avg": 7.18161678314209, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.6825131177902222, + "step": 2596, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 5, + "val/ratio": 1.999211311340332 + }, + { + "episode": 41568, + "epoch": 0.7471689973757055, + "loss/policy_avg": 3.5967116355895996, + "lr": 2.5021088957055216e-06, + "objective/entropy": -195.31704711914062, + "objective/kl": 16.412227630615234, + "objective/non_score_reward": -1.6412227153778076, + "objective/rlhf_reward": -6.164890921115875, + "objective/scores": 0.1, + "policy/approxkl_avg": 5.1165666580200195, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.8007971048355103, + "step": 2597, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 8, + "val/ratio": 2.0114803314208984 + }, + { + "episode": 41584, + "epoch": 0.7474565912930942, + "loss/policy_avg": 0.029494240880012512, + "lr": 2.5019171779141104e-06, + "objective/entropy": -298.34112548828125, + "objective/kl": 10.9917631149292, + "objective/non_score_reward": -1.099176287651062, + "objective/rlhf_reward": -2.571876461776804, + "objective/scores": 0.4562071871080222, + "policy/approxkl_avg": 11.627656936645508, + "policy/clipfrac_avg": 1.25, + "policy/entropy_avg": 0.7864717841148376, + "step": 2598, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 7, + "val/ratio": 1.998002052307129 + }, + { + "episode": 41600, + "epoch": 0.7477441852104828, + "loss/policy_avg": 2.6379692554473877, + "lr": 2.5017254601226992e-06, + "objective/entropy": 120.20051574707031, + "objective/kl": 20.99555206298828, + "objective/non_score_reward": -2.099555253982544, + "objective/rlhf_reward": -6.4508099062012985, + "objective/scores": 0.4868528072345416, + "policy/approxkl_avg": 7.775458335876465, + "policy/clipfrac_avg": 0.75, + "policy/entropy_avg": 0.6261385679244995, + "step": 2599, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003252029418945 + } + ], + "logging_steps": 500, + "max_steps": 7824, + "num_input_tokens_seen": 0, + "num_train_epochs": 9.0, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0, + "train_batch_size": null, + "trial_name": null, + "trial_params": null +}